diff --git hbase-handler/src/java/org/apache/hadoop/hive/hbase/HBaseStatsAggregator.java hbase-handler/src/java/org/apache/hadoop/hive/hbase/HBaseStatsAggregator.java index a9c3136..1b96232 100644 --- hbase-handler/src/java/org/apache/hadoop/hive/hbase/HBaseStatsAggregator.java +++ hbase-handler/src/java/org/apache/hadoop/hive/hbase/HBaseStatsAggregator.java @@ -32,7 +32,7 @@ import org.apache.hadoop.hbase.client.Scan; import org.apache.hadoop.hbase.filter.PrefixFilter; import org.apache.hadoop.hbase.util.Bytes; -import org.apache.hadoop.hive.ql.exec.mr.MapRedTask; +import org.apache.hadoop.hive.ql.exec.Task; import org.apache.hadoop.hive.ql.stats.StatsAggregator; @@ -47,7 +47,7 @@ /** * Does the necessary HBase initializations. */ - public boolean connect(Configuration hiveconf, MapRedTask sourceTask) { + public boolean connect(Configuration hiveconf, Task sourceTask) { try { htable = new HTable(HBaseConfiguration.create(hiveconf), diff --git metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java index d0e94bf..22c98df 100644 --- metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java +++ metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java @@ -282,7 +282,7 @@ public static boolean updatePartitionStatsFast(Partition part, Warehouse wh, // The partitition location already existed and may contain data. Lets try to // populate those statistics that don't require a full scan of the data. LOG.warn("Updating partition stats fast for: " + part.getTableName()); - FileStatus[] fileStatus = wh.getFileStatusesForPartition(part); + FileStatus[] fileStatus = wh.getFileStatusesForSD(part.getSd()); params.put(StatsSetupConst.NUM_FILES, Integer.toString(fileStatus.length)); long partSize = 0L; for (int i = 0; i < fileStatus.length; i++) { diff --git metastore/src/java/org/apache/hadoop/hive/metastore/Warehouse.java metastore/src/java/org/apache/hadoop/hive/metastore/Warehouse.java index c43145b..f731dab 100755 --- metastore/src/java/org/apache/hadoop/hive/metastore/Warehouse.java +++ metastore/src/java/org/apache/hadoop/hive/metastore/Warehouse.java @@ -53,6 +53,7 @@ import org.apache.hadoop.hive.metastore.api.MetaException; import org.apache.hadoop.hive.metastore.api.Partition; import org.apache.hadoop.hive.metastore.api.SkewedInfo; +import org.apache.hadoop.hive.metastore.api.StorageDescriptor; import org.apache.hadoop.hive.metastore.api.Table; import org.apache.hadoop.hive.shims.ShimLoader; import org.apache.hadoop.security.UserGroupInformation; @@ -500,16 +501,17 @@ public static String makePartName(List partCols, } /** - * @param partn - * @return array of FileStatus objects corresponding to the files making up the passed partition + * @param desc + * @return array of FileStatus objects corresponding to the files + * making up the passed storage description */ - public FileStatus[] getFileStatusesForPartition(Partition partn) + public FileStatus[] getFileStatusesForSD(StorageDescriptor desc) throws MetaException { try { - Path path = new Path(partn.getSd().getLocation()); + Path path = new Path(desc.getLocation()); FileSystem fileSys = path.getFileSystem(conf); /* consider sub-directory created from list bucketing. 
*/ - int listBucketingDepth = calculateListBucketingDMLDepth(partn); + int listBucketingDepth = calculateListBucketingDMLDepth(desc); return HiveStatsUtils.getFileStatusRecurse(path, (1 + listBucketingDepth), fileSys); } catch (IOException ioe) { MetaStoreUtils.logAndThrowMetaException(ioe); } @@ -521,13 +523,13 @@ public static String makePartName(List partCols, * List bucketing will introduce sub-directories. * calculate it here in order to go to the leaf directory * so that we can count right number of files. - * @param partn + * @param desc * @return */ - private static int calculateListBucketingDMLDepth(Partition partn) { + private static int calculateListBucketingDMLDepth(StorageDescriptor desc) { // list bucketing will introduce more files int listBucketingDepth = 0; - SkewedInfo skewedInfo = partn.getSd().getSkewedInfo(); + SkewedInfo skewedInfo = desc.getSkewedInfo(); if ((skewedInfo != null) && (skewedInfo.getSkewedColNames() != null) && (skewedInfo.getSkewedColNames().size() > 0) && (skewedInfo.getSkewedColValues() != null) diff --git ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java index cbc3cd2..09b514b 100644 --- ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java +++ ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java @@ -413,7 +413,16 @@ + "for partition columns"), STATISTICS_CLONING_FAILED(30013, "Cloning of statistics failed"), - ; + + STATSAGGREGATOR_SOURCETASK_NULL(30014, "SourceTask for StatsAggregator should not be null"), + + STATSAGGREGATOR_CONNECTION_ERROR(30015, "StatsAggregator cannot be connected to. " + + "There was an error while connecting to the StatsAggregator, and retrying " + + "might help. If you don't want the query to fail because accurate statistics " + + "could not be collected, set hive.stats.reliable=false"), + STATSAGGREGATOR_MISSED_SOMESTATS(30016, "Stats {0} is not collected via StatsAggregator. " + + "If you don't want the query to fail because of this, set hive.stats.atomic=false", true) + ; private int errorCode; private String mesg; diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java index bd2c4a8..ed7fa65 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java @@ -967,35 +967,36 @@ private void publishStats() throws HiveException { } String taskID = Utilities.getTaskIdFromFilename(Utilities.getTaskId(hconf)); - String spSpec = conf.getStaticSpec() != null ?
conf.getStaticSpec() : ""; - - for (String fspKey : valToPaths.keySet()) { - FSPaths fspValue = valToPaths.get(fspKey); - String key; - - // construct the key(fileID) to insert into the intermediate stats table - if (fspKey == "") { - if (statsPublisher instanceof CounterStatsPublisher) { - // key is of form either of dbName.TblName/ or dbName.TblName/p1=v1/ - key = Utilities.appendPathSeparator(conf.getTableInfo().getTableName() + Path.SEPARATOR + spSpec); - } else { - // for non-partitioned/static partitioned table, the key for temp storage is - // common key prefix + static partition spec + taskID - String keyPrefix = Utilities.getHashedStatsPrefix( - conf.getStatsAggPrefix() + spSpec, conf.getMaxStatsKeyPrefixLength()); - key = keyPrefix + taskID; - } + String spSpec = conf.getStaticSpec(); + + int maxKeyLength = conf.getMaxStatsKeyPrefixLength(); + boolean counterStats = statsPublisher instanceof CounterStatsPublisher; + + for (Map.Entry entry : valToPaths.entrySet()) { + String fspKey = entry.getKey(); // DP/LB + FSPaths fspValue = entry.getValue(); + + // split[0] = DP, split[1] = LB + String[] split = splitKey(fspKey); + String dpSpec = split[0]; + String lbSpec = split[1]; + + String prefix; + String postfix; + if (counterStats) { + // key = "database.table/SP/DP/"LB/ + prefix = conf.getTableInfo().getTableName(); + postfix = Utilities.join(lbSpec); } else { - if (statsPublisher instanceof CounterStatsPublisher) { - // key is of form either of dbName.TblName/p1=v1/ - key = Utilities.appendPathSeparator(Utilities.appendPathSeparator( - conf.getTableInfo().getTableName() + Path.SEPARATOR + spSpec) + fspKey); - } else { - // for partitioned table, the key is - // common key prefix + static partition spec + DynamicPartSpec + taskID - key = createKeyForStatsPublisher(taskID, spSpec, fspKey); - } + // key = "prefix/SP/DP/"LB/taskID/ + prefix = conf.getStatsAggPrefix(); + postfix = Utilities.join(lbSpec, taskID); } + prefix = Utilities.join(prefix, spSpec, dpSpec); + prefix = Utilities.getHashedStatsPrefix(prefix, maxKeyLength, postfix.length()); + + String key = Utilities.join(prefix, postfix); + Map statsToPublish = new HashMap(); for (String statType : fspValue.stat.getStoredStats()) { statsToPublish.put(statType, Long.toString(fspValue.stat.getStat(statType))); @@ -1051,29 +1052,20 @@ private void publishStats() throws HiveException { * key=484/value=val_484 or * HIVE_LIST_BUCKETING_DEFAULT_DIR_NAME/HIVE_LIST_BUCKETING_DEFAULT_DIR_NAME * so, at the end, "keyPrefix" doesn't have subdir information from skewed but "key" has - * @param taskID - * @param spSpec - * @param fspKey - * @return + * + * In a word, fspKey is consists of DP(dynamic partition spec) + LB(list bucketing spec) + * In stats publishing, full partition spec consists of prefix part of stat key + * but list bucketing spec is regarded as a postfix of stat key. So we split it here. 
*/ - private String createKeyForStatsPublisher(String taskID, String spSpec, String fspKey) { - String key; - String newFspKey = fspKey; - String storedAsDirPostFix = ""; - if (isSkewedStoredAsSubDirectories) { - List skewedValueDirList = this.lbCtx.getSkewedValuesDirNames(); - for (String dir : skewedValueDirList) { - newFspKey = newFspKey.replace(dir, ""); - if (!newFspKey.equals(fspKey)) { - storedAsDirPostFix = dir; - break; + private String[] splitKey(String fspKey) { + if (!fspKey.isEmpty() && isSkewedStoredAsSubDirectories) { + for (String dir : lbCtx.getSkewedValuesDirNames()) { + int index = fspKey.indexOf(dir); + if (index >= 0) { + return new String[] {fspKey.substring(0, index), fspKey.substring(index + 1)}; } } } - String keyPrefix = Utilities.getHashedStatsPrefix( - conf.getStatsAggPrefix() + spSpec + newFspKey, - conf.getMaxStatsKeyPrefixLength()); - key = keyPrefix + storedAsDirPostFix + taskID; - return key; + return new String[] {fspKey, null}; } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/StatsTask.java ql/src/java/org/apache/hadoop/hive/ql/exec/StatsTask.java index a2ecc80..9331a4b 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/StatsTask.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/StatsTask.java @@ -21,7 +21,6 @@ import java.io.Serializable; import java.util.ArrayList; -import java.util.HashMap; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; @@ -29,11 +28,11 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.common.StatsSetupConst; import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.conf.HiveConf.ConfVars; import org.apache.hadoop.hive.metastore.Warehouse; +import org.apache.hadoop.hive.metastore.api.MetaException; +import org.apache.hadoop.hive.metastore.api.StorageDescriptor; import org.apache.hadoop.hive.ql.DriverContext; import org.apache.hadoop.hive.ql.ErrorMsg; import org.apache.hadoop.hive.ql.metadata.HiveException; @@ -44,10 +43,10 @@ import org.apache.hadoop.hive.ql.plan.LoadTableDesc; import org.apache.hadoop.hive.ql.plan.StatsWork; import org.apache.hadoop.hive.ql.plan.api.StageType; +import org.apache.hadoop.hive.ql.stats.CounterStatsAggregator; import org.apache.hadoop.hive.ql.stats.StatsAggregator; import org.apache.hadoop.hive.ql.stats.StatsFactory; import org.apache.hadoop.hive.ql.stats.StatsPublisher; -import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.util.StringUtils; /** @@ -71,44 +70,6 @@ public StatsTask() { dpPartSpecs = null; } - /** - * - * Statistics for a Partition or Unpartitioned Table - * - */ - class Statistics { - Map stats; - - public Statistics() { - stats = new HashMap(); - for (String statType : StatsSetupConst.supportedStats) { - stats.put(statType, new LongWritable(0L)); - } - } - - public Statistics(Map st) { - stats = new HashMap(); - for (String statType : st.keySet()) { - Long stValue = st.get(statType) == null ? 0L : st.get(statType); - stats.put(statType, new LongWritable(stValue)); - } - } - - public long getStat(String statType) { - return stats.get(statType) == null ? 
0L : stats.get(statType).get(); - } - - public void setStat(String statType, long value) { - stats.put(statType, new LongWritable(value)); - } - - - @Override - public String toString() { - return org.apache.commons.lang.StringUtils.join(StatsSetupConst.supportedStats, ", "); - } - } - @Override protected void receiveFeed(FeedType feedType, Object feedValue) { // this method should be called by MoveTask when there are dynamic partitions generated @@ -175,176 +136,93 @@ private int aggregateStats() { try { // Stats setup: Warehouse wh = new Warehouse(conf); - String statsImplementationClass = HiveConf.getVar(conf, HiveConf.ConfVars.HIVESTATSDBCLASS); - if (!this.getWork().getNoStatsAggregator()) { - StatsFactory factory = StatsFactory.newFactory(statsImplementationClass, conf); - if (factory != null && work.isNoScanAnalyzeCommand()){ - // initialize stats publishing table for noscan which has only stats task - // the rest of MR task following stats task initializes it in ExecDriver.java - StatsPublisher statsPublisher = factory.getStatsPublisher(); - if (!statsPublisher.init(conf)) { // creating stats table if not exists - if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) { - throw - new HiveException(ErrorMsg.STATSPUBLISHER_INITIALIZATION_ERROR.getErrorCodedMsg()); - } + if (!getWork().getNoStatsAggregator() && !getWork().isNoScanAnalyzeCommand()) { + try { + statsAggregator = createStatsAggregator(conf); + } catch (HiveException e) { + if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) { + throw e; } - } - if (factory != null) { - statsAggregator = factory.getStatsAggregator(); - // manufacture a StatsAggregator - if (!statsAggregator.connect(conf, getWork().getSourceTask())) { - throw new HiveException("StatsAggregator connect failed " + statsImplementationClass); - } - } - } - - Statistics tblStats = new Statistics(); - - org.apache.hadoop.hive.metastore.api.Table tTable = table.getTTable(); - Map parameters = tTable.getParameters(); - - boolean tableStatsExist = this.existStats(parameters); - - for (String statType : StatsSetupConst.supportedStats) { - if (parameters.containsKey(statType)) { - tblStats.setStat(statType, Long.parseLong(parameters.get(statType))); + console.printError("skipping stat aggregation by exception " + e); } } List partitions = getPartitionsList(); boolean atomic = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_ATOMIC); - int maxPrefixLength = HiveConf.getIntVar(conf, - HiveConf.ConfVars.HIVE_STATS_KEY_PREFIX_MAX_LENGTH); + String tableFullName = table.getDbName() + "." 
+ table.getTableName(); + int maxPrefixLength = StatsFactory.getMaxPrefixLength(conf); + + // "counter" type does not need to collect stats per task + boolean counterStat = statsAggregator instanceof CounterStatsAggregator; if (partitions == null) { + org.apache.hadoop.hive.metastore.api.Table tTable = table.getTTable(); + Map parameters = tTable.getParameters(); // non-partitioned tables: - if (!tableStatsExist && atomic) { + if (!existStats(parameters) && atomic) { return 0; } - if (statsAggregator != null) { - String aggKey; - if (statsImplementationClass.equals("counter")) { - // Key is of the form dbName.tblName/ - aggKey = tableFullName+Path.SEPARATOR; - } else { - // In case of a non-partitioned table, the key for stats temporary store is "rootDir" - aggKey = Utilities.getHashedStatsPrefix(work.getAggKey(), maxPrefixLength); - } - updateStats(StatsSetupConst.statsRequireCompute, tblStats, statsAggregator, parameters, - aggKey, atomic); - statsAggregator.cleanUp(aggKey); - } // The collectable stats for the aggregator needs to be cleared. // For eg. if a file is being loaded, the old number of rows are not valid - else if (work.isClearAggregatorStats()) { - for (String statType : StatsSetupConst.statsRequireCompute) { - if (parameters.containsKey(statType)) { - tblStats.setStat(statType, 0L); - } - } + if (work.isClearAggregatorStats()) { + clearStats(parameters); } - // write table stats to metastore - parameters = tTable.getParameters(); - for (String statType : StatsSetupConst.statsRequireCompute) { - parameters.put(statType, Long.toString(tblStats.getStat(statType))); + if (statsAggregator != null) { + String prefix = getAggregationPrefix(counterStat, table, null); + String aggKey = Utilities.getHashedStatsPrefix(prefix, maxPrefixLength, 0); + updateStats(statsAggregator, parameters, aggKey, atomic); + statsAggregator.cleanUp(aggKey); } + + updateQuickStats(wh, parameters, tTable.getSd()); + + // write table stats to metastore parameters.put(StatsSetupConst.STATS_GENERATED_VIA_STATS_TASK, StatsSetupConst.TRUE); - tTable.setParameters(parameters); + db.alterTable(tableFullName, new Table(tTable)); - console.printInfo("Table " + tableFullName + " stats: [" + tblStats.toString() + ']'); + console.printInfo("Table " + tableFullName + " stats: [" + toString(parameters) + ']'); } else { // Partitioned table: // Need to get the old stats of the partition // and update the table stats based on the old and new stats. + List updates = new ArrayList(); for (Partition partn : partitions) { // // get the old partition stats // org.apache.hadoop.hive.metastore.api.Partition tPart = partn.getTPartition(); - parameters = tPart.getParameters(); - - boolean hasStats = this.existStats(parameters); - if (!hasStats && atomic) { + Map parameters = tPart.getParameters(); + if (!existStats(parameters) && atomic) { continue; } - Map currentValues = new HashMap(); - for (String statType : StatsSetupConst.supportedStats) { - Long val = parameters.containsKey(statType) ? Long.parseLong(parameters.get(statType)) - : 0L; - currentValues.put(statType, val); - } - - // - // get the new partition stats - // - Statistics newPartStats = new Statistics(); - String partitionID; - - if (statsImplementationClass.equals("counter")) { - // stat-Agg-key is of form : dbName.tblName/p1=v1/p2=val2/ - partitionID = Utilities.appendPathSeparator(tableFullName + Path.SEPARATOR + - Warehouse.makePartPath(partn.getSpec())); - // there is no need to aggregate stats in this case, but this should also work. 
- // also check non-partitioned code path. - } else { - // In that case of a partition, the key for stats temporary store is - // "rootDir/[dynamic_partition_specs/]%" - partitionID = Utilities.getHashedStatsPrefix( - work.getAggKey() + Warehouse.makePartPath(partn.getSpec()), maxPrefixLength); + // The collectable stats for the aggregator needs to be cleared. + // For eg. if a file is being loaded, the old number of rows are not valid + if (work.isClearAggregatorStats()) { + clearStats(parameters); } - LOG.info("Stats aggregator : " + partitionID); if (statsAggregator != null) { - updateStats(StatsSetupConst.statsRequireCompute, newPartStats, statsAggregator, - parameters, partitionID, atomic); - statsAggregator.cleanUp(partitionID); - } else { - for (String statType : StatsSetupConst.statsRequireCompute) { - // The collectable stats for the aggregator needs to be cleared. - // For eg. if a file is being loaded, the old number of rows are not valid - if (work.isClearAggregatorStats()) { - if (parameters.containsKey(statType)) { - newPartStats.setStat(statType, 0L); - } - } - else { - newPartStats.setStat(statType, currentValues.get(statType)); - } - } - } - /** - * calculate fast statistics - */ - FileStatus[] partfileStatus = wh.getFileStatusesForPartition(tPart); - newPartStats.setStat(StatsSetupConst.NUM_FILES, partfileStatus.length); - long partSize = 0L; - for (int i = 0; i < partfileStatus.length; i++) { - partSize += partfileStatus[i].getLen(); + String prefix = getAggregationPrefix(counterStat, table, partn); + String aggKey = Utilities.getHashedStatsPrefix(prefix, maxPrefixLength, 0); + updateStats(statsAggregator, parameters, aggKey, atomic); + statsAggregator.cleanUp(aggKey); } - newPartStats.setStat(StatsSetupConst.TOTAL_SIZE, partSize); - // - // update the metastore - // - for (String statType : StatsSetupConst.supportedStats) { - long statValue = newPartStats.getStat(statType); - if (statValue >= 0) { - parameters.put(statType, Long.toString(newPartStats.getStat(statType))); - } - } + updateQuickStats(wh, parameters, tPart.getSd()); parameters.put(StatsSetupConst.STATS_GENERATED_VIA_STATS_TASK, StatsSetupConst.TRUE); - tPart.setParameters(parameters); - db.alterPartition(tableFullName, new Partition(table, tPart)); + updates.add(new Partition(table, tPart)); console.printInfo("Partition " + tableFullName + partn.getSpec() + - " stats: [" + newPartStats.toString() + ']'); + " stats: [" + toString(parameters) + ']'); + } + if (!updates.isEmpty()) { + db.alterPartitions(tableFullName, updates); } - } } catch (Exception e) { @@ -366,6 +244,49 @@ else if (work.isClearAggregatorStats()) { return ret; } + private String getAggregationPrefix(boolean counter, Table table, Partition partition) + throws MetaException { + if (!counter && partition == null) { + return work.getAggKey(); + } + StringBuilder prefix = new StringBuilder(); + if (counter) { + // prefix is of the form dbName.tblName + prefix.append(table.getDbName()).append('.').append(table.getTableName()); + } else { + // In case of a non-partitioned table, the key for stats temporary store is "rootDir" + prefix.append(work.getAggKey()); + } + if (partition != null) { + return Utilities.join(prefix.toString(), Warehouse.makePartPath(partition.getSpec())); + } + return prefix.toString(); + } + + private StatsAggregator createStatsAggregator(HiveConf conf) throws HiveException { + String statsImplementationClass = HiveConf.getVar(conf, HiveConf.ConfVars.HIVESTATSDBCLASS); + StatsFactory factory = 
StatsFactory.newFactory(statsImplementationClass, conf); + if (factory == null) { + throw new HiveException(ErrorMsg.STATSPUBLISHER_NOT_OBTAINED.getErrorCodedMsg()); + } + // initialize stats publishing table for noscan which has only stats task + // the rest of MR task following stats task initializes it in ExecDriver.java + StatsPublisher statsPublisher = factory.getStatsPublisher(); + if (!statsPublisher.init(conf)) { // creating stats table if not exists + throw new HiveException(ErrorMsg.STATSPUBLISHER_INITIALIZATION_ERROR.getErrorCodedMsg()); + } + Task sourceTask = getWork().getSourceTask(); + if (sourceTask == null) { + throw new HiveException(ErrorMsg.STATSAGGREGATOR_SOURCETASK_NULL.getErrorCodedMsg()); + } + // manufacture a StatsAggregator + StatsAggregator statsAggregator = factory.getStatsAggregator(); + if (!statsAggregator.connect(conf, sourceTask)) { + throw new HiveException(ErrorMsg.STATSAGGREGATOR_CONNECTION_ERROR.getErrorCodedMsg()); + } + return statsAggregator; + } + private boolean existStats(Map parameters) { return parameters.containsKey(StatsSetupConst.ROW_COUNT) || parameters.containsKey(StatsSetupConst.NUM_FILES) @@ -374,31 +295,64 @@ private boolean existStats(Map parameters) { || parameters.containsKey(StatsSetupConst.NUM_PARTITIONS); } - private void updateStats(String[] statsList, Statistics stats, - StatsAggregator statsAggregator, Map parameters, - String aggKey, boolean atomic) throws HiveException { + private void updateStats(StatsAggregator statsAggregator, + Map parameters, String aggKey, boolean atomic) throws HiveException { - String value; - Long longValue; - for (String statType : statsList) { - value = statsAggregator.aggregateStats(aggKey, statType); - if (value != null) { - longValue = Long.parseLong(value); + for (String statType : StatsSetupConst.statsRequireCompute) { + String value = statsAggregator.aggregateStats(aggKey, statType); + if (value != null && !value.isEmpty()) { + long longValue = Long.parseLong(value); if (work.getLoadTableDesc() != null && !work.getLoadTableDesc().getReplace()) { String originalValue = parameters.get(statType); if (originalValue != null && !originalValue.equals("-1")) { - longValue += Long.parseLong(originalValue); + longValue += Long.parseLong(originalValue); // todo: invalid + valid = invalid } } - stats.setStat(statType, longValue); + parameters.put(statType, String.valueOf(longValue)); } else { if (atomic) { - throw new HiveException("StatsAggregator failed to get statistics."); + throw new HiveException(ErrorMsg.STATSAGGREGATOR_MISSED_SOMESTATS, statType); + } + } + } + } + + private void updateQuickStats(Warehouse wh, Map parameters, + StorageDescriptor desc) throws MetaException { + /** + * calculate fast statistics + */ + FileStatus[] partfileStatus = wh.getFileStatusesForSD(desc); + parameters.put(StatsSetupConst.NUM_FILES, String.valueOf(partfileStatus.length)); + long partSize = 0L; + for (int i = 0; i < partfileStatus.length; i++) { + partSize += partfileStatus[i].getLen(); + } + parameters.put(StatsSetupConst.TOTAL_SIZE, String.valueOf(partSize)); + } + + private void clearStats(Map parameters) { + for (String statType : StatsSetupConst.supportedStats) { + if (parameters.containsKey(statType)) { + parameters.put(statType, "0"); + } + } + } + + private String toString(Map parameters) { + StringBuilder builder = new StringBuilder(); + for (String statType : StatsSetupConst.supportedStats) { + String value = parameters.get(statType); + if (value != null) { + if (builder.length() > 0) { + 
builder.append(", "); } + builder.append(statType).append('=').append(value); } } + return builder.toString(); } /** diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java index 46d88ce..0e3cfe7 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java @@ -28,8 +28,6 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.common.FileUtils; import org.apache.hadoop.hive.common.StatsSetupConst; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.conf.HiveConf.ConfVars; import org.apache.hadoop.hive.ql.ErrorMsg; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.metadata.VirtualColumn; @@ -286,34 +284,21 @@ private void publishStats() throws HiveException { return; } - String key; String taskID = Utilities.getTaskIdFromFilename(Utilities.getTaskId(hconf)); Map statsToPublish = new HashMap(); for (String pspecs : stats.keySet()) { statsToPublish.clear(); - if (pspecs.isEmpty()) { - if (statsPublisher instanceof CounterStatsPublisher) { - // key is of form : dbName.TblName/ - key = conf.getStatsAggPrefix(); - } else { - // In case of a non-partitioned table, the key for temp storage is just - // "tableName + taskID" - String keyPrefix = Utilities.getHashedStatsPrefix( - conf.getStatsAggPrefix(), conf.getMaxStatsKeyPrefixLength()); - key = keyPrefix + taskID; - } + String prefix = Utilities.join(conf.getStatsAggPrefix(), pspecs); + + String key; + int maxKeyLength = conf.getMaxStatsKeyPrefixLength(); + if (statsPublisher instanceof CounterStatsPublisher) { + key = Utilities.getHashedStatsPrefix(prefix, maxKeyLength, 0); } else { - if (statsPublisher instanceof CounterStatsPublisher) { - // key is of form : dbName.tblName/p1=v1/ - key = Utilities.appendPathSeparator(conf.getStatsAggPrefix()+pspecs); - } else { - // In case of a partition, the key for temp storage is - // "tableName + partitionSpecs + taskID" - String keyPrefix = Utilities.getHashedStatsPrefix( - conf.getStatsAggPrefix() + pspecs, conf.getMaxStatsKeyPrefixLength()); - key = keyPrefix + taskID; - } + // stats publisher except counter type needs postfix 'taskID' + prefix = Utilities.getHashedStatsPrefix(prefix, maxKeyLength, taskID.length()); + key = prefix + taskID; } for(String statType : stats.get(pspecs).getStoredStats()) { statsToPublish.put(statType, Long.toString(stats.get(pspecs).getStat(statType))); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java index 197a20f..0934a91 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java @@ -2284,25 +2284,35 @@ public static StatsPublisher getStatsPublisher(JobConf jc) { * @param maxPrefixLength * @return */ - public static String getHashedStatsPrefix(String statsPrefix, int maxPrefixLength) { - String ret = appendPathSeparator(statsPrefix); - if (maxPrefixLength >= 0 && statsPrefix.length() > maxPrefixLength) { + public static String getHashedStatsPrefix(String statsPrefix, + int maxPrefixLength, int postfixLength) { + // todo: this might return possibly longer prefix than + // maxPrefixLength (if set) when maxPrefixLength - postfixLength < 17, + // which would make stat values invalid (especially for 'counter' type) + if (maxPrefixLength >= 0 && statsPrefix.length() > maxPrefixLength - 
postfixLength) { try { MessageDigest digester = MessageDigest.getInstance("MD5"); - digester.update(ret.getBytes()); - ret = new String(digester.digest()) + Path.SEPARATOR; + digester.update(statsPrefix.getBytes()); + return new String(digester.digest()) + Path.SEPARATOR; // 17 byte } catch (NoSuchAlgorithmException e) { throw new RuntimeException(e); } } - return ret; + return statsPrefix.endsWith(Path.SEPARATOR) ? statsPrefix : statsPrefix + Path.SEPARATOR; } - public static String appendPathSeparator(String path) { - if (!path.endsWith(Path.SEPARATOR)) { - path = path + Path.SEPARATOR; + public static String join(String... elements) { + StringBuilder builder = new StringBuilder(); + for (String element : elements) { + if (element == null || element.isEmpty()) { + continue; + } + builder.append(element); + if (!element.endsWith(Path.SEPARATOR)) { + builder.append(Path.SEPARATOR); + } } - return path; + return builder.toString(); } public static void setColumnNameList(JobConf jobConf, Operator op) { diff --git ql/src/java/org/apache/hadoop/hive/ql/io/rcfile/stats/PartialScanMapper.java ql/src/java/org/apache/hadoop/hive/ql/io/rcfile/stats/PartialScanMapper.java index 7e701f4..e319fe4 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/rcfile/stats/PartialScanMapper.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/rcfile/stats/PartialScanMapper.java @@ -26,13 +26,13 @@ import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hive.common.StatsSetupConst; import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.conf.HiveConf.ConfVars; import org.apache.hadoop.hive.ql.ErrorMsg; import org.apache.hadoop.hive.ql.exec.Utilities; import org.apache.hadoop.hive.ql.io.RCFile.KeyBuffer; import org.apache.hadoop.hive.ql.io.rcfile.merge.RCFileKeyBufferWrapper; import org.apache.hadoop.hive.ql.io.rcfile.merge.RCFileValueBufferWrapper; import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.stats.StatsFactory; import org.apache.hadoop.hive.ql.stats.StatsPublisher; import org.apache.hadoop.hive.shims.CombineHiveKey; import org.apache.hadoop.mapred.JobConf; @@ -142,7 +142,7 @@ private void publishStats() throws HiveException { // construct key used to store stats in intermediate db String taskID = Utilities.getTaskIdFromFilename(Utilities.getTaskId(jc)); String keyPrefix = Utilities.getHashedStatsPrefix( - statsAggKeyPrefix, HiveConf.getIntVar(jc, ConfVars.HIVE_STATS_KEY_PREFIX_MAX_LENGTH)); + statsAggKeyPrefix, StatsFactory.getMaxPrefixLength(jc), taskID.length()); String key = keyPrefix + taskID; // construct statistics to be stored diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRFileSink1.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRFileSink1.java index cca8481..fd811f3 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRFileSink1.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRFileSink1.java @@ -44,7 +44,6 @@ import org.apache.hadoop.hive.ql.exec.TaskFactory; import org.apache.hadoop.hive.ql.exec.UnionOperator; import org.apache.hadoop.hive.ql.exec.Utilities; -import org.apache.hadoop.hive.ql.exec.mr.MapRedTask; import org.apache.hadoop.hive.ql.io.RCFileInputFormat; import org.apache.hadoop.hive.ql.io.rcfile.merge.MergeWork; import org.apache.hadoop.hive.ql.lib.Node; @@ -65,6 +64,7 @@ import org.apache.hadoop.hive.ql.plan.PartitionDesc; import org.apache.hadoop.hive.ql.plan.StatsWork; import org.apache.hadoop.hive.ql.plan.TableDesc; +import 
org.apache.hadoop.hive.ql.stats.StatsFactory; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; import org.apache.hadoop.mapred.InputFormat; @@ -231,7 +231,7 @@ private void addStatsTask(FileSinkOperator nd, MoveTask mvTask, } assert statsWork != null : "Error when genereting StatsTask"; - statsWork.setSourceTask((MapRedTask)currTask); + statsWork.setSourceTask(currTask); statsWork.setStatsReliable(hconf.getBoolVar(ConfVars.HIVE_STATS_RELIABLE)); MapredWork mrWork = (MapredWork) currTask.getWork(); @@ -247,8 +247,7 @@ private void addStatsTask(FileSinkOperator nd, MoveTask mvTask, mrWork.getReduceWork().setGatheringStats(true); } nd.getConf().setStatsReliable(hconf.getBoolVar(ConfVars.HIVE_STATS_RELIABLE)); - nd.getConf().setMaxStatsKeyPrefixLength( - hconf.getIntVar(ConfVars.HIVE_STATS_KEY_PREFIX_MAX_LENGTH)); + nd.getConf().setMaxStatsKeyPrefixLength(StatsFactory.getMaxPrefixLength(hconf)); // mrWork.addDestinationTable(nd.getConf().getTableInfo().getTableName()); // subscribe feeds from the MoveTask so that MoveTask can forward the list diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRTableScan1.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRTableScan1.java index af729e6..8d3dc56 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRTableScan1.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRTableScan1.java @@ -75,7 +75,7 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx opProcCtx, // create a dummy MapReduce task MapredWork currWork = GenMapRedUtils.getMapRedWork(parseCtx); - Task currTask = TaskFactory.get(currWork, parseCtx.getConf()); + MapRedTask currTask = (MapRedTask) TaskFactory.get(currWork, parseCtx.getConf()); Operator currTopOp = op; ctx.setCurrTask(currTask); ctx.setCurrTopOp(currTopOp); @@ -96,7 +96,7 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx opProcCtx, StatsWork statsWork = new StatsWork(parseCtx.getQB().getParseInfo().getTableSpec()); statsWork.setAggKey(op.getConf().getStatsAggPrefix()); - statsWork.setSourceTask((MapRedTask)currTask); + statsWork.setSourceTask(currTask); statsWork.setStatsReliable( parseCtx.getConf().getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE)); Task statsTask = TaskFactory.get(statsWork, parseCtx.getConf()); diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/DDLSemanticAnalyzer.java ql/src/java/org/apache/hadoop/hive/ql/parse/DDLSemanticAnalyzer.java index 7443ea4..6b08bb1 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/DDLSemanticAnalyzer.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/DDLSemanticAnalyzer.java @@ -1009,6 +1009,7 @@ private void analyzeTruncateTable(ASTNode ast) throws SemanticException { statDesc = new StatsWork(ltd); } statDesc.setNoStatsAggregator(true); + statDesc.setClearAggregatorStats(true); statDesc.setStatsReliable(conf.getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE)); Task statTask = TaskFactory.get(statDesc, conf); moveTsk.addDependentTask(statTask); @@ -1622,6 +1623,7 @@ private void analyzeAlterTablePartMergeFiles(ASTNode tablePartAST, ASTNode ast, statDesc = new StatsWork(ltd); } statDesc.setNoStatsAggregator(true); + statDesc.setClearAggregatorStats(true); statDesc.setStatsReliable(conf.getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE)); Task statTask = TaskFactory.get(statDesc, conf); moveTsk.addDependentTask(statTask); diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java index d0a0ec7..f799689 100644 --- 
ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java @@ -155,6 +155,7 @@ import org.apache.hadoop.hive.ql.plan.ptf.PartitionedTableFunctionDef; import org.apache.hadoop.hive.ql.session.SessionState; import org.apache.hadoop.hive.ql.session.SessionState.ResourceType; +import org.apache.hadoop.hive.ql.stats.StatsFactory; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFHash; @@ -8510,8 +8511,7 @@ private void setupStats(TableScanDesc tsDesc, QBParseInfo qbp, Table tab, String } else { tsDesc.setGatherStats(true); tsDesc.setStatsReliable(conf.getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE)); - tsDesc.setMaxStatsKeyPrefixLength( - conf.getIntVar(HiveConf.ConfVars.HIVE_STATS_KEY_PREFIX_MAX_LENGTH)); + tsDesc.setMaxStatsKeyPrefixLength(StatsFactory.getMaxPrefixLength(conf)); // append additional virtual columns for storing statistics Iterator vcs = VirtualColumn.getStatsRegistry(conf).iterator(); diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/StatsWork.java ql/src/java/org/apache/hadoop/hive/ql/plan/StatsWork.java index 0dd0b03..0f0e825 100644 --- ql/src/java/org/apache/hadoop/hive/ql/plan/StatsWork.java +++ ql/src/java/org/apache/hadoop/hive/ql/plan/StatsWork.java @@ -20,7 +20,7 @@ import java.io.Serializable; -import org.apache.hadoop.hive.ql.exec.mr.MapRedTask; +import org.apache.hadoop.hive.ql.exec.Task; import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.tableSpec; /** @@ -50,7 +50,7 @@ private boolean isPartialScanAnalyzeCommand = false; - private transient MapRedTask sourceTask; + private transient Task sourceTask; public StatsWork() { } @@ -144,11 +144,11 @@ public void setPartialScanAnalyzeCommand(boolean isPartialScanAnalyzeCommand) { this.isPartialScanAnalyzeCommand = isPartialScanAnalyzeCommand; } - public MapRedTask getSourceTask() { + public Task getSourceTask() { return sourceTask; } - public void setSourceTask(MapRedTask sourceTask) { + public void setSourceTask(Task sourceTask) { this.sourceTask = sourceTask; } } diff --git ql/src/java/org/apache/hadoop/hive/ql/stats/CounterStatsAggregator.java ql/src/java/org/apache/hadoop/hive/ql/stats/CounterStatsAggregator.java index fa430eb..bf3ca1d 100644 --- ql/src/java/org/apache/hadoop/hive/ql/stats/CounterStatsAggregator.java +++ ql/src/java/org/apache/hadoop/hive/ql/stats/CounterStatsAggregator.java @@ -19,15 +19,14 @@ package org.apache.hadoop.hive.ql.stats; import java.io.IOException; -import java.util.Iterator; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.ql.exec.Task; import org.apache.hadoop.hive.ql.exec.mr.ExecDriver; import org.apache.hadoop.hive.ql.exec.mr.MapRedTask; import org.apache.hadoop.mapred.Counters; -import org.apache.hadoop.mapred.Counters.Counter; import org.apache.hadoop.mapred.JobClient; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.RunningJob; @@ -40,7 +39,7 @@ private JobClient jc; @Override - public boolean connect(Configuration hconf, MapRedTask sourceTask) { + public boolean connect(Configuration hconf, Task sourceTask) { try { jc = new JobClient(toJobConf(hconf)); RunningJob job = jc.getJob(sourceTask.getJobID()); diff --git ql/src/java/org/apache/hadoop/hive/ql/stats/StatsAggregator.java 
ql/src/java/org/apache/hadoop/hive/ql/stats/StatsAggregator.java index 661d648..57aa4c9 100644 --- ql/src/java/org/apache/hadoop/hive/ql/stats/StatsAggregator.java +++ ql/src/java/org/apache/hadoop/hive/ql/stats/StatsAggregator.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hive.ql.stats; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.ql.exec.Task; import org.apache.hadoop.hive.ql.exec.mr.MapRedTask; /** @@ -35,7 +36,7 @@ * @param sourceTask * @return true if connection is successful, false otherwise. */ - public boolean connect(Configuration hconf, MapRedTask sourceTask); + public boolean connect(Configuration hconf, Task sourceTask); /** * This method aggregates a given statistic from all tasks (partial stats). diff --git ql/src/java/org/apache/hadoop/hive/ql/stats/StatsFactory.java ql/src/java/org/apache/hadoop/hive/ql/stats/StatsFactory.java index 8ae32f0..2fb880d 100644 --- ql/src/java/org/apache/hadoop/hive/ql/stats/StatsFactory.java +++ ql/src/java/org/apache/hadoop/hive/ql/stats/StatsFactory.java @@ -28,6 +28,9 @@ import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.util.ReflectionUtils; +import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVESTATSDBCLASS; +import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVE_STATS_KEY_PREFIX_MAX_LENGTH; + /** * A factory of stats publisher and aggregator implementations of the * StatsPublisher and StatsAggregator interfaces. @@ -40,8 +43,19 @@ private Class aggregatorImplementation; private Configuration jobConf; + public static int getMaxPrefixLength(Configuration conf) { + int maxPrefixLength = HiveConf.getIntVar(conf, HIVE_STATS_KEY_PREFIX_MAX_LENGTH); + if (HiveConf.getVar(conf, HIVESTATSDBCLASS).equalsIgnoreCase(StatDB.counter.name())) { + // see org.apache.hadoop.mapred.Counter or org.apache.hadoop.mapreduce.MRJobConfig + int groupNameMax = conf.getInt("mapreduce.job.counters.group.name.max", 128); + maxPrefixLength = maxPrefixLength < 0 ? 
groupNameMax : + Math.min(maxPrefixLength, groupNameMax); + } + return maxPrefixLength; + } + public static StatsFactory newFactory(Configuration conf) { - return newFactory(HiveConf.getVar(conf, HiveConf.ConfVars.HIVESTATSDBCLASS), conf); + return newFactory(HiveConf.getVar(conf, HIVESTATSDBCLASS), conf); } /** diff --git ql/src/java/org/apache/hadoop/hive/ql/stats/jdbc/JDBCStatsAggregator.java ql/src/java/org/apache/hadoop/hive/ql/stats/jdbc/JDBCStatsAggregator.java index fb5f50e..5fb15fc 100644 --- ql/src/java/org/apache/hadoop/hive/ql/stats/jdbc/JDBCStatsAggregator.java +++ ql/src/java/org/apache/hadoop/hive/ql/stats/jdbc/JDBCStatsAggregator.java @@ -32,6 +32,7 @@ import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.exec.Task; import org.apache.hadoop.hive.ql.exec.mr.MapRedTask; import org.apache.hadoop.hive.ql.exec.Utilities; import org.apache.hadoop.hive.ql.stats.StatsAggregator; @@ -55,7 +56,7 @@ public JDBCStatsAggregator() { } @Override - public boolean connect(Configuration hiveconf, MapRedTask sourceTask) { + public boolean connect(Configuration hiveconf, Task sourceTask) { this.hiveconf = hiveconf; timeout = HiveConf.getIntVar(hiveconf, HiveConf.ConfVars.HIVE_STATS_JDBC_TIMEOUT); connectionString = HiveConf.getVar(hiveconf, HiveConf.ConfVars.HIVESTATSDBCONNECTIONSTRING); diff --git ql/src/java/org/apache/hadoop/hive/ql/stats/jdbc/JDBCStatsPublisher.java ql/src/java/org/apache/hadoop/hive/ql/stats/jdbc/JDBCStatsPublisher.java index 8c23b87..606ccb3 100644 --- ql/src/java/org/apache/hadoop/hive/ql/stats/jdbc/JDBCStatsPublisher.java +++ ql/src/java/org/apache/hadoop/hive/ql/stats/jdbc/JDBCStatsPublisher.java @@ -111,6 +111,7 @@ public Void run(PreparedStatement stmt) throws SQLException { } catch (SQLException e) { // for SQLTransientException (maxRetries already achieved at Utilities retry functions // or SQLNonTransientException, declare a real failure + LOG.error("Error during JDBC connection to " + connectionString + ". 
", e); return false; } } diff --git ql/src/test/queries/clientpositive/stats_counter.q ql/src/test/queries/clientpositive/stats_counter.q index 20769e4..3c1f132 100644 --- ql/src/test/queries/clientpositive/stats_counter.q +++ ql/src/test/queries/clientpositive/stats_counter.q @@ -1,6 +1,16 @@ -set hive.stats.autogather=true; set hive.stats.dbclass=counter; +set hive.stats.autogather=false; + +-- by analyze +create table dummy1 as select * from src; + +analyze table dummy1 compute statistics; +desc formatted dummy1; + +set hive.stats.dbclass=counter; +set hive.stats.autogather=true; -create table dummy as select * from src; +-- by autogather +create table dummy2 as select * from src; -desc formatted dummy; +desc formatted dummy2; diff --git ql/src/test/queries/clientpositive/stats_noscan_2.q ql/src/test/queries/clientpositive/stats_noscan_2.q index a19d01b..b106b30 100644 --- ql/src/test/queries/clientpositive/stats_noscan_2.q +++ ql/src/test/queries/clientpositive/stats_noscan_2.q @@ -3,10 +3,10 @@ -- 1 test table CREATE EXTERNAL TABLE anaylyze_external (a INT) LOCATION '${system:hive.root}/data/files/ext_test'; SELECT * FROM anaylyze_external; -analyze table anaylyze_external compute statistics; -describe formatted anaylyze_external; analyze table anaylyze_external compute statistics noscan; describe formatted anaylyze_external; +analyze table anaylyze_external compute statistics; +describe formatted anaylyze_external; drop table anaylyze_external; -- 2 test partition @@ -21,10 +21,10 @@ CREATE EXTERNAL TABLE anaylyze_external (key string, val string) partitioned by ALTER TABLE anaylyze_external ADD PARTITION (insertdate='2008-01-01') location 'pfile://${system:test.tmp.dir}/texternal/2008-01-01'; select count(*) from anaylyze_external where insertdate='2008-01-01'; -- analyze -analyze table anaylyze_external PARTITION (insertdate='2008-01-01') compute statistics; -describe formatted anaylyze_external PARTITION (insertdate='2008-01-01'); analyze table anaylyze_external PARTITION (insertdate='2008-01-01') compute statistics noscan; describe formatted anaylyze_external PARTITION (insertdate='2008-01-01'); +analyze table anaylyze_external PARTITION (insertdate='2008-01-01') compute statistics; +describe formatted anaylyze_external PARTITION (insertdate='2008-01-01'); dfs -rmr ${system:test.tmp.dir}/texternal; drop table anaylyze_external; diff --git ql/src/test/results/clientpositive/stats_counter.q.out ql/src/test/results/clientpositive/stats_counter.q.out index f15d8c5..40d8656 100644 --- ql/src/test/results/clientpositive/stats_counter.q.out +++ ql/src/test/results/clientpositive/stats_counter.q.out @@ -1,13 +1,66 @@ -PREHOOK: query: create table dummy as select * from src +PREHOOK: query: -- by analyze +create table dummy1 as select * from src PREHOOK: type: CREATETABLE_AS_SELECT PREHOOK: Input: default@src -POSTHOOK: query: create table dummy as select * from src +POSTHOOK: query: -- by analyze +create table dummy1 as select * from src POSTHOOK: type: CREATETABLE_AS_SELECT POSTHOOK: Input: default@src -POSTHOOK: Output: default@dummy -PREHOOK: query: desc formatted dummy +POSTHOOK: Output: default@dummy1 +PREHOOK: query: analyze table dummy1 compute statistics +PREHOOK: type: QUERY +PREHOOK: Input: default@dummy1 +PREHOOK: Output: default@dummy1 +POSTHOOK: query: analyze table dummy1 compute statistics +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dummy1 +POSTHOOK: Output: default@dummy1 +PREHOOK: query: desc formatted dummy1 PREHOOK: type: DESCTABLE -POSTHOOK: query: desc formatted 
dummy +POSTHOOK: query: desc formatted dummy1 +POSTHOOK: type: DESCTABLE +# col_name data_type comment + +key string None +value string None + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE true + numFiles 1 + numRows 500 + rawDataSize 5312 + totalSize 5812 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: -- by autogather +create table dummy2 as select * from src +PREHOOK: type: CREATETABLE_AS_SELECT +PREHOOK: Input: default@src +POSTHOOK: query: -- by autogather +create table dummy2 as select * from src +POSTHOOK: type: CREATETABLE_AS_SELECT +POSTHOOK: Input: default@src +POSTHOOK: Output: default@dummy2 +PREHOOK: query: desc formatted dummy2 +PREHOOK: type: DESCTABLE +POSTHOOK: query: desc formatted dummy2 POSTHOOK: type: DESCTABLE # col_name data_type comment diff --git ql/src/test/results/clientpositive/stats_noscan_1.q.out ql/src/test/results/clientpositive/stats_noscan_1.q.out index 5aa6607..615a97f 100644 --- ql/src/test/results/clientpositive/stats_noscan_1.q.out +++ ql/src/test/results/clientpositive/stats_noscan_1.q.out @@ -135,8 +135,8 @@ Protect Mode: None Partition Parameters: COLUMN_STATS_ACCURATE true numFiles 1 - numRows 0 - rawDataSize 0 + numRows -1 + rawDataSize -1 totalSize 5812 #### A masked pattern was here #### @@ -183,8 +183,8 @@ Protect Mode: None Partition Parameters: COLUMN_STATS_ACCURATE true numFiles 1 - numRows 0 - rawDataSize 0 + numRows -1 + rawDataSize -1 totalSize 5812 #### A masked pattern was here #### @@ -510,8 +510,8 @@ Protect Mode: None Partition Parameters: COLUMN_STATS_ACCURATE true numFiles 1 - numRows 0 - rawDataSize 0 + numRows -1 + rawDataSize -1 totalSize 5812 #### A masked pattern was here #### @@ -566,8 +566,8 @@ Protect Mode: None Partition Parameters: COLUMN_STATS_ACCURATE true numFiles 1 - numRows 0 - rawDataSize 0 + numRows -1 + rawDataSize -1 totalSize 5812 #### A masked pattern was here #### diff --git ql/src/test/results/clientpositive/stats_noscan_2.q.out ql/src/test/results/clientpositive/stats_noscan_2.q.out index e55fa94..b17d456 100644 --- ql/src/test/results/clientpositive/stats_noscan_2.q.out +++ ql/src/test/results/clientpositive/stats_noscan_2.q.out @@ -21,11 +21,11 @@ POSTHOOK: Input: default@anaylyze_external 4 5 6 -PREHOOK: query: analyze table anaylyze_external compute statistics +PREHOOK: query: analyze table anaylyze_external compute statistics noscan PREHOOK: type: QUERY PREHOOK: Input: default@anaylyze_external PREHOOK: Output: default@anaylyze_external -POSTHOOK: query: analyze table anaylyze_external compute statistics +POSTHOOK: query: analyze table anaylyze_external compute statistics noscan POSTHOOK: type: QUERY POSTHOOK: Input: default@anaylyze_external POSTHOOK: Output: default@anaylyze_external @@ -48,8 +48,8 @@ Table Parameters: COLUMN_STATS_ACCURATE true EXTERNAL TRUE numFiles 0 - numRows 6 - rawDataSize 6 + numRows -1 + rawDataSize -1 totalSize 0 #### A masked pattern was here #### @@ -63,11 +63,11 @@ Bucket Columns: [] Sort Columns: [] Storage Desc Params: serialization.format 1 -PREHOOK: 
query: analyze table anaylyze_external compute statistics noscan +PREHOOK: query: analyze table anaylyze_external compute statistics PREHOOK: type: QUERY PREHOOK: Input: default@anaylyze_external PREHOOK: Output: default@anaylyze_external -POSTHOOK: query: analyze table anaylyze_external compute statistics noscan +POSTHOOK: query: analyze table anaylyze_external compute statistics POSTHOOK: type: QUERY POSTHOOK: Input: default@anaylyze_external POSTHOOK: Output: default@anaylyze_external @@ -90,8 +90,8 @@ Table Parameters: COLUMN_STATS_ACCURATE true EXTERNAL TRUE numFiles 0 - numRows 0 - rawDataSize 0 + numRows 6 + rawDataSize 6 totalSize 0 #### A masked pattern was here #### @@ -184,14 +184,14 @@ POSTHOOK: Lineage: texternal PARTITION(insertdate=2008-01-01).key SIMPLE [(src)s POSTHOOK: Lineage: texternal PARTITION(insertdate=2008-01-01).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] 500 PREHOOK: query: -- analyze -analyze table anaylyze_external PARTITION (insertdate='2008-01-01') compute statistics +analyze table anaylyze_external PARTITION (insertdate='2008-01-01') compute statistics noscan PREHOOK: type: QUERY PREHOOK: Input: default@anaylyze_external PREHOOK: Input: default@anaylyze_external@insertdate=2008-01-01 PREHOOK: Output: default@anaylyze_external PREHOOK: Output: default@anaylyze_external@insertdate=2008-01-01 POSTHOOK: query: -- analyze -analyze table anaylyze_external PARTITION (insertdate='2008-01-01') compute statistics +analyze table anaylyze_external PARTITION (insertdate='2008-01-01') compute statistics noscan POSTHOOK: type: QUERY POSTHOOK: Input: default@anaylyze_external POSTHOOK: Input: default@anaylyze_external@insertdate=2008-01-01 @@ -225,8 +225,8 @@ Protect Mode: None Partition Parameters: COLUMN_STATS_ACCURATE true numFiles 1 - numRows 500 - rawDataSize 5312 + numRows -1 + rawDataSize -1 totalSize 5812 #### A masked pattern was here #### @@ -240,13 +240,13 @@ Bucket Columns: [] Sort Columns: [] Storage Desc Params: serialization.format 1 -PREHOOK: query: analyze table anaylyze_external PARTITION (insertdate='2008-01-01') compute statistics noscan +PREHOOK: query: analyze table anaylyze_external PARTITION (insertdate='2008-01-01') compute statistics PREHOOK: type: QUERY PREHOOK: Input: default@anaylyze_external PREHOOK: Input: default@anaylyze_external@insertdate=2008-01-01 PREHOOK: Output: default@anaylyze_external PREHOOK: Output: default@anaylyze_external@insertdate=2008-01-01 -POSTHOOK: query: analyze table anaylyze_external PARTITION (insertdate='2008-01-01') compute statistics noscan +POSTHOOK: query: analyze table anaylyze_external PARTITION (insertdate='2008-01-01') compute statistics POSTHOOK: type: QUERY POSTHOOK: Input: default@anaylyze_external POSTHOOK: Input: default@anaylyze_external@insertdate=2008-01-01 @@ -280,8 +280,8 @@ Protect Mode: None Partition Parameters: COLUMN_STATS_ACCURATE true numFiles 1 - numRows 0 - rawDataSize 0 + numRows 500 + rawDataSize 5312 totalSize 5812 #### A masked pattern was here #### diff --git ql/src/test/results/clientpositive/truncate_column.q.out ql/src/test/results/clientpositive/truncate_column.q.out index a247c4a..82f4988 100644 --- ql/src/test/results/clientpositive/truncate_column.q.out +++ ql/src/test/results/clientpositive/truncate_column.q.out @@ -108,8 +108,8 @@ Table Type: MANAGED_TABLE Table Parameters: COLUMN_STATS_ACCURATE true numFiles 1 - numRows -1 - rawDataSize -1 + numRows 0 + rawDataSize 0 totalSize 150 #### A masked pattern was here #### @@ -194,8 +194,8 @@ Table 
Type: MANAGED_TABLE Table Parameters: COLUMN_STATS_ACCURATE true numFiles 1 - numRows -1 - rawDataSize -1 + numRows 0 + rawDataSize 0 totalSize 75 #### A masked pattern was here #### @@ -270,8 +270,8 @@ Table Type: MANAGED_TABLE Table Parameters: COLUMN_STATS_ACCURATE true numFiles 1 - numRows -1 - rawDataSize -1 + numRows 0 + rawDataSize 0 totalSize 75 #### A masked pattern was here #### @@ -444,8 +444,8 @@ Table Parameters: COLUMN_STATS_ACCURATE true #### A masked pattern was here #### numFiles 1 - numRows -1 - rawDataSize -1 + numRows 0 + rawDataSize 0 totalSize 150 #### A masked pattern was here #### @@ -527,8 +527,8 @@ Table Parameters: COLUMN_STATS_ACCURATE true #### A masked pattern was here #### numFiles 1 - numRows -1 - rawDataSize -1 + numRows 0 + rawDataSize 0 totalSize 75 #### A masked pattern was here #### @@ -720,8 +720,8 @@ Protect Mode: None Partition Parameters: COLUMN_STATS_ACCURATE true numFiles 1 - numRows 10 - rawDataSize 94 + numRows 0 + rawDataSize 0 totalSize 150 #### A masked pattern was here ####
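
Taken together, the patch rebuilds stat keys from small helpers instead of ad-hoc string concatenation. The core building block is the new Utilities.join(String...), which appends a path separator after every non-empty element and silently drops null/empty parts (a missing static or dynamic partition spec). A minimal, self-contained sketch of that joining rule follows; the class name and sample values are invented for illustration.

    // Standalone sketch mirroring the join(String...) helper added to Utilities.
    public class JoinSketch {
      static final String SEP = "/";

      // Join non-empty elements, ensuring each ends with exactly one separator.
      static String join(String... elements) {
        StringBuilder builder = new StringBuilder();
        for (String element : elements) {
          if (element == null || element.isEmpty()) {
            continue; // e.g. no static partition spec
          }
          builder.append(element);
          if (!element.endsWith(SEP)) {
            builder.append(SEP);
          }
        }
        return builder.toString();
      }

      public static void main(String[] args) {
        // null/empty parts simply drop out of the key
        System.out.println(join("default.tbl", null, "ds=2008-04-08/", "hr=11"));
        // -> default.tbl/ds=2008-04-08/hr=11/
      }
    }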
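
The second helper is the extended Utilities.getHashedStatsPrefix(prefix, maxPrefixLength, postfixLength): the prefix is MD5-hashed only when prefix plus the reserved postfix (taskID, list-bucketing dirs) would exceed the configured limit, so the postfix itself is never hashed away. Below is a self-contained sketch of that rule; only the wrapper class and the sample inputs are made up.

    import java.security.MessageDigest;
    import java.security.NoSuchAlgorithmException;

    // Simplified sketch of the prefix-hashing rule in Utilities.getHashedStatsPrefix.
    public class HashedPrefixSketch {
      static final String SEP = "/";

      static String getHashedStatsPrefix(String statsPrefix, int maxPrefixLength, int postfixLength) {
        // hash only if prefix + reserved postfix would overflow the limit
        if (maxPrefixLength >= 0 && statsPrefix.length() > maxPrefixLength - postfixLength) {
          try {
            MessageDigest digester = MessageDigest.getInstance("MD5");
            digester.update(statsPrefix.getBytes());
            // digest bytes + trailing separator (the patch notes this as ~17 bytes)
            return new String(digester.digest()) + SEP;
          } catch (NoSuchAlgorithmException e) {
            throw new RuntimeException(e);
          }
        }
        return statsPrefix.endsWith(SEP) ? statsPrefix : statsPrefix + SEP;
      }

      public static void main(String[] args) {
        String taskID = "000001_0";
        String key = getHashedStatsPrefix("db.tbl/ds=2008-04-08/hr=11", 24, taskID.length());
        System.out.println(key.endsWith(SEP)); // true; the long prefix was replaced by its MD5 digest
      }
    }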
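
FileSinkOperator.splitKey separates the per-writer fspKey into a dynamic-partition part (which joins the key prefix) and a list-bucketing part (which stays in the postfix, next to the taskID). The sketch below copies that split logic verbatim; note that the leading-slash format of the skewed-values directory names is an assumption made for the example, since the real names come from ListBucketingCtx.getSkewedValuesDirNames().

    import java.util.Arrays;
    import java.util.List;

    // Standalone illustration of FileSinkOperator.splitKey (DP prefix vs. LB postfix).
    public class SplitKeySketch {
      static String[] splitKey(String fspKey, boolean skewedStoredAsSubDirectories,
          List<String> skewedValuesDirNames) {
        if (!fspKey.isEmpty() && skewedStoredAsSubDirectories) {
          for (String dir : skewedValuesDirNames) {
            int index = fspKey.indexOf(dir);
            if (index >= 0) {
              return new String[] {fspKey.substring(0, index), fspKey.substring(index + 1)};
            }
          }
        }
        return new String[] {fspKey, null}; // no list bucketing: whole key is the DP part
      }

      public static void main(String[] args) {
        // assumed directory-name format for the example
        List<String> lbDirs = Arrays.asList("/key=484/value=val_484");
        String[] split = splitKey("hr=11/key=484/value=val_484", true, lbDirs);
        System.out.println(split[0]); // hr=11                  (DP part, goes into the prefix)
        System.out.println(split[1]); // key=484/value=val_484  (LB part, kept as postfix)
      }
    }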
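
StatsFactory.getMaxPrefixLength exists because the "counter" stats DB turns the key into a MapReduce counter group name, which has its own length limit (mapreduce.job.counters.group.name.max, default 128). A small sketch of the capping rule, with the configuration lookups stubbed out to keep it self-contained:

    // Sketch of StatsFactory.getMaxPrefixLength: cap the configured prefix limit by the
    // counter group-name limit when hive.stats.dbclass=counter.
    public class MaxPrefixLengthSketch {
      static int getMaxPrefixLength(int configuredMax, boolean counterDbClass, int groupNameMax) {
        if (counterDbClass) {
          // a negative configured value means "unlimited", which still must be capped
          return configuredMax < 0 ? groupNameMax : Math.min(configuredMax, groupNameMax);
        }
        return configuredMax;
      }

      public static void main(String[] args) {
        System.out.println(getMaxPrefixLength(-1, true, 128));   // 128
        System.out.println(getMaxPrefixLength(200, true, 128));  // 128
        System.out.println(getMaxPrefixLength(200, false, 128)); // 200 (JDBC/HBase keep the configured value)
      }
    }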
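
The rewritten StatsTask.updateStats applies one accumulation rule per statistic: an aggregated value replaces the stored one, except for non-replacing loads (INSERT INTO without OVERWRITE), where it is added to the previous value unless that value is the "unknown" marker -1; a missing value raises STATSAGGREGATOR_MISSED_SOMESTATS when hive.stats.atomic=true. A minimal sketch of that rule, with the aggregator stubbed by a plain string argument:

    import java.util.HashMap;
    import java.util.Map;

    // Sketch of the per-statistic accumulation rule in StatsTask.updateStats.
    public class UpdateStatsSketch {
      static void updateStat(Map<String, String> parameters, String statType,
          String aggregated, boolean replaceLoad) {
        if (aggregated == null || aggregated.isEmpty()) {
          return; // the real code throws here when hive.stats.atomic=true
        }
        long value = Long.parseLong(aggregated);
        if (!replaceLoad) {
          String original = parameters.get(statType);
          if (original != null && !original.equals("-1")) {
            value += Long.parseLong(original); // accumulate onto the existing stat
          }
        }
        parameters.put(statType, String.valueOf(value));
      }

      public static void main(String[] args) {
        Map<String, String> params = new HashMap<String, String>();
        params.put("numRows", "500");
        updateStat(params, "numRows", "100", false); // INSERT INTO: 500 + 100
        System.out.println(params.get("numRows"));   // 600
      }
    }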
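
Independently of the aggregator, StatsTask.updateQuickStats and MetaStoreUtils.updatePartitionStatsFast fill in the "quick" statistics that need no data scan: numFiles and totalSize, derived from the file listing returned by Warehouse.getFileStatusesForSD(StorageDescriptor). The sketch below shows only that derivation; it walks a local directory with java.io.File purely to stay self-contained, whereas the real code lists HDFS paths recursively.

    import java.io.File;
    import java.util.HashMap;
    import java.util.Map;

    // Minimal sketch of the "quick stats" (file count, total bytes) written without a scan.
    public class QuickStatsSketch {
      static Map<String, String> quickStats(File location) {
        long numFiles = 0;
        long totalSize = 0;
        File[] children = location.listFiles();
        if (children != null) {
          for (File f : children) {
            if (f.isFile()) {
              numFiles++;
              totalSize += f.length();
            }
          }
        }
        Map<String, String> params = new HashMap<String, String>();
        params.put("numFiles", String.valueOf(numFiles));
        params.put("totalSize", String.valueOf(totalSize));
        return params;
      }

      public static void main(String[] args) {
        System.out.println(quickStats(new File(".")));
      }
    }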
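
Finally, Warehouse.calculateListBucketingDMLDepth decides how many extra directory levels the file listing must descend, because list-bucketing DML stores one sub-directory level per skewed column. The hunk above is truncated, so the depth formula in this sketch (one level per skewed column, only when stored as sub-directories) is an assumption spelled out here rather than a quote of the real body.

    import java.util.Arrays;
    import java.util.List;

    // Sketch of the recursion-depth rule assumed for calculateListBucketingDMLDepth.
    public class ListBucketingDepthSketch {
      static int listBucketingDepth(List<String> skewedColNames, boolean hasSkewedValues,
          boolean storedAsSubDirectories) {
        if (skewedColNames != null && !skewedColNames.isEmpty()
            && hasSkewedValues && storedAsSubDirectories) {
          return skewedColNames.size(); // assumed: one directory level per skewed column
        }
        return 0;
      }

      public static void main(String[] args) {
        int depth = listBucketingDepth(Arrays.asList("key", "value"), true, true);
        // file listing then recurses 1 (partition dirs) + depth (LB dirs) levels
        System.out.println(1 + depth); // 3
      }
    }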