diff --git common/src/java/org/apache/hadoop/hive/common/StatsSetupConst.java common/src/java/org/apache/hadoop/hive/common/StatsSetupConst.java index 7c27d07024..45f1afec5d 100644 --- common/src/java/org/apache/hadoop/hive/common/StatsSetupConst.java +++ common/src/java/org/apache/hadoop/hive/common/StatsSetupConst.java @@ -257,6 +257,14 @@ public static void setColumnStatsState(Map params, List } } + public static boolean canColumnStatsMerge(Map params, String colName) { + if (params == null) { + return false; + } + ColumnStatsAccurate stats = parseStatsAcc(params.get(COLUMN_STATS_ACCURATE)); + return stats.columnStats.containsKey(colName); + } + public static void clearColumnStatsState(Map params) { if (params == null) { return; @@ -294,7 +302,9 @@ public static void setStatsStateForCreateTable(Map params, } } setBasicStatsState(params, setting); - setColumnStatsState(params, cols); + if (TRUE.equals(setting)) { + setColumnStatsState(params, cols); + } } private static ColumnStatsAccurate parseStatsAcc(String statsAcc) { diff --git common/src/java/org/apache/hadoop/hive/common/jsonexplain/Vertex.java common/src/java/org/apache/hadoop/hive/common/jsonexplain/Vertex.java index b7dc88c939..bcbfe977dd 100644 --- common/src/java/org/apache/hadoop/hive/common/jsonexplain/Vertex.java +++ common/src/java/org/apache/hadoop/hive/common/jsonexplain/Vertex.java @@ -249,7 +249,8 @@ public void print(Printer printer, int indentFlag, String type, Vertex callingVe // find the right op Op choose = null; for (Op op : this.outputOps) { - if (op.outputVertexName.equals(callingVertex.name)) { + // op.outputVertexName may be null + if (callingVertex.name.equals(op.outputVertexName)) { choose = op; } } diff --git itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/TestMTQueries.java itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/TestMTQueries.java index ad2baa2e26..e8ef4b97d6 100644 --- itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/TestMTQueries.java +++ itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/TestMTQueries.java @@ -44,6 +44,7 @@ public void testMTQueries1() throws Exception { util.getConf().setBoolean("hive.exec.submit.local.task.via.child", true); util.getConf().set("hive.stats.dbclass", "fs"); util.getConf().set("hive.mapred.mode", "nonstrict"); + util.getConf().set("hive.stats.column.autogather", "false"); } boolean success = QTestUtil.queryListRunnerMultiThreaded(qfiles, qts); if (!success) { diff --git itests/hive-unit/src/test/java/org/apache/hive/jdbc/TestJdbcWithMiniHS2.java itests/hive-unit/src/test/java/org/apache/hive/jdbc/TestJdbcWithMiniHS2.java index 4a9af80fdc..96173c014e 100644 --- itests/hive-unit/src/test/java/org/apache/hive/jdbc/TestJdbcWithMiniHS2.java +++ itests/hive-unit/src/test/java/org/apache/hive/jdbc/TestJdbcWithMiniHS2.java @@ -201,6 +201,7 @@ private static void startMiniHS2(HiveConf conf) throws Exception { private static void startMiniHS2(HiveConf conf, boolean httpMode) throws Exception { conf.setBoolVar(ConfVars.HIVE_SUPPORT_CONCURRENCY, false); conf.setBoolVar(ConfVars.HIVE_SERVER2_LOGGING_OPERATION_ENABLED, false); + conf.setBoolVar(ConfVars.HIVESTATSCOLAUTOGATHER, false); MiniHS2.Builder builder = new MiniHS2.Builder().withConf(conf).cleanupLocalDirOnStartup(false); if (httpMode) { builder = builder.withHTTPTransport(); diff --git itests/src/test/resources/testconfiguration.properties itests/src/test/resources/testconfiguration.properties index d472bb3f9e..7bbf2d7ad8 100644 --- 
itests/src/test/resources/testconfiguration.properties +++ itests/src/test/resources/testconfiguration.properties @@ -111,6 +111,9 @@ minillaplocal.shared.query.files=alter_merge_2_orc.q,\ auto_sortmerge_join_7.q,\ auto_sortmerge_join_8.q,\ auto_sortmerge_join_9.q,\ + autoColumnStats_1.q,\ + autoColumnStats_10.q,\ + autoColumnStats_2.q,\ bucket2.q,\ bucket3.q,\ bucket4.q,\ @@ -472,8 +475,6 @@ minillaplocal.query.files=\ auto_sortmerge_join_6.q,\ auto_sortmerge_join_8.q,\ auto_sortmerge_join_9.q,\ - autoColumnStats_1.q,\ - autoColumnStats_2.q,\ bucket4.q,\ bucket_groupby.q,\ bucket_many.q,\ diff --git metastore/src/java/org/apache/hadoop/hive/metastore/HiveAlterHandler.java metastore/src/java/org/apache/hadoop/hive/metastore/HiveAlterHandler.java index b279e1d567..3dbe72fef4 100644 --- metastore/src/java/org/apache/hadoop/hive/metastore/HiveAlterHandler.java +++ metastore/src/java/org/apache/hadoop/hive/metastore/HiveAlterHandler.java @@ -248,7 +248,7 @@ public void alterTable(RawStore msdb, Warehouse wh, String dbname, part.setDbName(newDbName); part.setTableName(newTblName); ColumnStatistics colStats = updateOrGetPartitionColumnStats(msdb, dbname, name, - part.getValues(), part.getSd().getCols(), oldt, part); + part.getValues(), part.getSd().getCols(), oldt, part, null); if (colStats != null) { columnStatsNeedUpdated.put(part, colStats); } @@ -287,7 +287,7 @@ public void alterTable(RawStore msdb, Warehouse wh, String dbname, List oldCols = part.getSd().getCols(); part.getSd().setCols(newt.getSd().getCols()); ColumnStatistics colStats = updateOrGetPartitionColumnStats(msdb, dbname, name, - part.getValues(), oldCols, oldt, part); + part.getValues(), oldCols, oldt, part, null); assert(colStats == null); msdb.alterPartition(dbname, name, part.getValues(), part); } @@ -296,6 +296,17 @@ public void alterTable(RawStore msdb, Warehouse wh, String dbname, LOG.warn("Alter table does not cascade changes to its partitions."); } } else { + if (isPartitionedTable + && !MetaStoreUtils.areSameColumns(oldt.getSd().getCols(), newt.getSd().getCols())) { + parts = msdb.getPartitions(dbname, name, -1); + for (Partition part : parts) { + List oldCols = part.getSd().getCols(); + ColumnStatistics colStats = updateOrGetPartitionColumnStats(msdb, dbname, name, + part.getValues(), oldCols, oldt, part, newt.getSd().getCols()); + assert (colStats == null); + msdb.alterPartition(dbname, name, part.getValues(), part); + } + } alterTableUpdateTableColumnStats(msdb, oldt, newt); } } @@ -412,7 +423,7 @@ public Partition alterPartition(final RawStore msdb, Warehouse wh, final String // PartitionView does not have SD. 
We do not need to update its column stats
     if (oldPart.getSd() != null) {
       updateOrGetPartitionColumnStats(msdb, dbname, name, new_part.getValues(),
-          oldPart.getSd().getCols(), tbl, new_part);
+          oldPart.getSd().getCols(), tbl, new_part, null);
     }
     msdb.alterPartition(dbname, name, new_part.getValues(), new_part);
     if (transactionalListeners != null && !transactionalListeners.isEmpty()) {
@@ -539,7 +550,7 @@ public Partition alterPartition(final RawStore msdb, Warehouse wh, final String
       String newPartName = Warehouse.makePartName(tbl.getPartitionKeys(), new_part.getValues());
       ColumnStatistics cs = updateOrGetPartitionColumnStats(msdb, dbname, name, oldPart.getValues(),
-          oldPart.getSd().getCols(), tbl, new_part);
+          oldPart.getSd().getCols(), tbl, new_part, null);
       msdb.alterPartition(dbname, name, part_vals, new_part);
       if (cs != null) {
         cs.getStatsDesc().setPartName(newPartName);
@@ -637,7 +648,7 @@ public Partition alterPartition(final RawStore msdb, Warehouse wh, final String
         // PartitionView does not have SD and we do not need to update its column stats
         if (oldTmpPart.getSd() != null) {
           updateOrGetPartitionColumnStats(msdb, dbname, name, oldTmpPart.getValues(),
-              oldTmpPart.getSd().getCols(), tbl, tmpPart);
+              oldTmpPart.getSd().getCols(), tbl, tmpPart, null);
         }
       }
@@ -789,12 +800,14 @@ void alterTableUpdateTableColumnStats(RawStore msdb, Table oldTable, Table newTa
   private ColumnStatistics updateOrGetPartitionColumnStats(
       RawStore msdb, String dbname, String tblname, List<String> partVals,
-      List<FieldSchema> oldCols, Table table, Partition part)
+      List<FieldSchema> oldCols, Table table, Partition part, List<FieldSchema> newCols)
       throws MetaException, InvalidObjectException {
     ColumnStatistics newPartsColStats = null;
     try {
-      List<FieldSchema> newCols = part.getSd() == null ?
-          new ArrayList<FieldSchema>() : part.getSd().getCols();
+      // If newCols is not specified, default to the partition's current columns.
+      if (newCols == null) {
+        newCols = part.getSd() == null ? new ArrayList<FieldSchema>() : part.getSd().getCols();
+      }
       String oldPartName = Warehouse.makePartName(table.getPartitionKeys(), partVals);
       String newPartName = Warehouse.makePartName(table.getPartitionKeys(), part.getValues());
       boolean rename = !part.getDbName().equals(dbname) || !part.getTableName().equals(tblname)
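[Editor's note] The new newCols parameter above is the heart of this HiveAlterHandler change: alterTable can now revalidate each partition's column stats against the post-alter table schema even when cascade is off, while existing callers pass null to keep the old behavior. A minimal sketch of the null-default idiom, where resolveCols is a hypothetical helper name, not something in the patch:

    // Hedged sketch: a caller that knows the post-alter schema passes it in;
    // everyone else passes null and gets the partition's own columns back.
    private static List<FieldSchema> resolveCols(Partition part, List<FieldSchema> newCols) {
      if (newCols != null) {
        return newCols;                         // schema supplied by alterTable
      }
      return part.getSd() == null
          ? new ArrayList<FieldSchema>()        // PartitionView: no storage descriptor
          : part.getSd().getCols();             // default: the partition's current columns
    }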
diff --git metastore/src/java/org/apache/hadoop/hive/metastore/HiveMetaStore.java metastore/src/java/org/apache/hadoop/hive/metastore/HiveMetaStore.java
index 6393c8e9e2..f8c95d418a 100644
--- metastore/src/java/org/apache/hadoop/hive/metastore/HiveMetaStore.java
+++ metastore/src/java/org/apache/hadoop/hive/metastore/HiveMetaStore.java
@@ -35,6 +35,7 @@
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.LinkedHashMap;
+import java.util.LinkedHashSet;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
@@ -63,6 +64,7 @@
 import com.google.common.collect.ImmutableListMultimap;
 import com.google.common.collect.Lists;
 import com.google.common.collect.Multimaps;
+
 import org.apache.commons.cli.OptionBuilder;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
@@ -6964,39 +6966,80 @@ public boolean set_aggr_stats_for(SetPartitionsStatsRequest request)
         if (request.isSetNeedMerge() && request.isNeedMerge()) {
           // one single call to get all column stats
           ColumnStatistics csOld = getMS().getTableColumnStatistics(dbName, tableName, colNames);
-          if (csOld != null && csOld.getStatsObjSize() != 0) {
+          Table t = getTable(dbName, tableName);
+          // first use t.getParameters() to prune stats that are no longer accurate
+          MetaStoreUtils.pruneColumnStats(firstColStats, t.getParameters());
+          // then merge whatever survives the pruning
+          if (csOld != null && csOld.getStatsObjSize() != 0
+              && !firstColStats.getStatsObj().isEmpty()) {
             MetaStoreUtils.mergeColStats(firstColStats, csOld);
           }
+          if (!firstColStats.getStatsObj().isEmpty()) {
+            return update_table_column_statistics(firstColStats);
+          } else {
+            LOG.debug("None of the column stats are accurate enough to merge.");
+            return true;
+          }
+        } else {
+          // This is the overwrite case; accuracy does not matter here.
+          return update_table_column_statistics(firstColStats);
         }
-        return update_table_column_statistics(firstColStats);
       }
     } else {
       // partition level column stats merging
-      List<String> partitionNames = new ArrayList<>();
+      List<Partition> partitions = new ArrayList<>();
+      // note that the request may carry two or more entries for the same partition name;
+      // see autoColumnStats_2.q under TestMiniLlapLocalCliDriver
+      Map<String, ColumnStatistics> newStatsMap = new HashMap<>();
       for (ColumnStatistics csNew : csNews) {
-        partitionNames.add(csNew.getStatsDesc().getPartName());
+        String partName = csNew.getStatsDesc().getPartName();
+        if (newStatsMap.containsKey(partName)) {
+          MetaStoreUtils.mergeColStats(csNew, newStatsMap.get(partName));
+        }
+        newStatsMap.put(partName, csNew);
       }
-      Map<String, ColumnStatistics> map = new HashMap<>();
+
+      Map<String, ColumnStatistics> oldStatsMap = new HashMap<>();
+      Map<String, Partition> mapToPart = new HashMap<>();
       if (request.isSetNeedMerge() && request.isNeedMerge()) {
         // a single call to get all column stats for all partitions
+        List<String> partitionNames = new ArrayList<>();
+        partitionNames.addAll(newStatsMap.keySet());
         List<ColumnStatistics> csOlds = getMS().getPartitionColumnStatistics(dbName, tableName,
             partitionNames, colNames);
-        if (csNews.size() != csOlds.size()) {
+        if (newStatsMap.values().size() != csOlds.size()) {
          // some of the partitions miss stats.
          LOG.debug("Some of the partitions miss stats.");
         }
         for (ColumnStatistics csOld : csOlds) {
-          map.put(csOld.getStatsDesc().getPartName(), csOld);
+          oldStatsMap.put(csOld.getStatsDesc().getPartName(), csOld);
+        }
+        // another single call to get all the partition objects
+        partitions = getMS().getPartitionsByNames(dbName, tableName, partitionNames);
+        for (int index = 0; index < partitionNames.size(); index++) {
+          mapToPart.put(partitionNames.get(index), partitions.get(index));
         }
       }
       Table t = getTable(dbName, tableName);
-      for (int index = 0; index < csNews.size(); index++) {
-        ColumnStatistics csNew = csNews.get(index);
-        ColumnStatistics csOld = map.get(csNew.getStatsDesc().getPartName());
-        if (csOld != null && csOld.getStatsObjSize() != 0) {
-          MetaStoreUtils.mergeColStats(csNew, csOld);
+      for (Entry<String, ColumnStatistics> entry : newStatsMap.entrySet()) {
+        ColumnStatistics csNew = entry.getValue();
+        ColumnStatistics csOld = oldStatsMap.get(entry.getKey());
+        if (request.isSetNeedMerge() && request.isNeedMerge()) {
+          // first use the partition's getParameters() to prune inaccurate stats
+          MetaStoreUtils.pruneColumnStats(csNew, mapToPart.get(entry.getKey()).getParameters());
+          // then merge whatever survives the pruning
+          if (csOld != null && csOld.getStatsObjSize() != 0 && !csNew.getStatsObj().isEmpty()) {
+            MetaStoreUtils.mergeColStats(csNew, csOld);
+          }
+          if (!csNew.getStatsObj().isEmpty()) {
+            ret = ret && updatePartitonColStats(t, csNew);
+          } else {
+            LOG.debug("The column stats for " + csNew.getStatsDesc().getPartName()
+                + " are not accurate enough to merge.");
+          }
+        } else {
+          ret = ret && updatePartitonColStats(t, csNew);
         }
-        ret = ret && updatePartitonColStats(t, csNew);
       }
     }
     return ret;
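[Editor's note] The partition-level path above is easy to misread: duplicate entries for the same partition name are folded into newStatsMap before the metastore is consulted, so a partition that appears twice in the request is first merged with its earlier self. A sketch of just that folding step, extracted as a fragment; foldByPartition is a hypothetical name, and the metastore api types are assumed on the classpath:

    // Fold duplicate per-partition stats; the merged entry replaces the map slot.
    static Map<String, ColumnStatistics> foldByPartition(List<ColumnStatistics> csNews)
        throws InvalidObjectException {
      Map<String, ColumnStatistics> byPart = new HashMap<>();
      for (ColumnStatistics cs : csNews) {
        ColumnStatistics prev = byPart.get(cs.getStatsDesc().getPartName());
        if (prev != null) {
          MetaStoreUtils.mergeColStats(cs, prev); // merge the older duplicate into the newer entry
        }
        byPart.put(cs.getStatsDesc().getPartName(), cs);
      }
      return byPart;
    }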
LOG.debug("Some of the partitions miss stats."); } for (ColumnStatistics csOld : csOlds) { - map.put(csOld.getStatsDesc().getPartName(), csOld); + oldStatsMap.put(csOld.getStatsDesc().getPartName(), csOld); + } + // another single call to get all the partition objects + partitions = getMS().getPartitionsByNames(dbName, tableName, partitionNames); + for (int index = 0; index < partitionNames.size(); index++) { + mapToPart.put(partitionNames.get(index), partitions.get(index)); } } Table t = getTable(dbName, tableName); - for (int index = 0; index < csNews.size(); index++) { - ColumnStatistics csNew = csNews.get(index); - ColumnStatistics csOld = map.get(csNew.getStatsDesc().getPartName()); - if (csOld != null && csOld.getStatsObjSize() != 0) { - MetaStoreUtils.mergeColStats(csNew, csOld); + for (Entry entry : newStatsMap.entrySet()) { + ColumnStatistics csNew = entry.getValue(); + ColumnStatistics csOld = oldStatsMap.get(entry.getKey()); + if (request.isSetNeedMerge() && request.isNeedMerge()) { + // we first use getParameters() to prune the stats + MetaStoreUtils.pruneColumnStats(csNew, mapToPart.get(entry.getKey()).getParameters()); + // we merge those that can be merged + if (csOld != null && csOld.getStatsObjSize() != 0 && !csNew.getStatsObj().isEmpty()) { + MetaStoreUtils.mergeColStats(csNew, csOld); + } + if (!csNew.getStatsObj().isEmpty()) { + ret = ret && updatePartitonColStats(t, csNew); + } else { + LOG.debug("All the column stats " + csNew.getStatsDesc().getPartName() + + " are not accurate to merge."); + } + } else { + ret = ret && updatePartitonColStats(t, csNew); } - ret = ret && updatePartitonColStats(t, csNew); } } return ret; diff --git metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java index bbe13fd77b..f2cd1d7895 100644 --- metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java +++ metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java @@ -1903,6 +1903,19 @@ public static String encodeTableName(String name) { return sb.toString(); } + public static void pruneColumnStats(ColumnStatistics csNew, Map parameters) { + List list = new ArrayList<>(); + for (int index = 0; index < csNew.getStatsObj().size(); index++) { + ColumnStatisticsObj statsObjNew = csNew.getStatsObj().get(index); + // canColumnStatsMerge guarantees that it is accurate before we do merge + if (StatsSetupConst.canColumnStatsMerge(parameters, statsObjNew.getColName())) { + list.add(statsObjNew); + } + // in all the other cases, we can not merge + } + csNew.setStatsObj(list); + } + // this function will merge csOld into csNew. public static void mergeColStats(ColumnStatistics csNew, ColumnStatistics csOld) throws InvalidObjectException { @@ -1926,13 +1939,20 @@ public static void mergeColStats(ColumnStatistics csNew, ColumnStatistics csOld) ColumnStatisticsObj statsObjNew = csNew.getStatsObj().get(index); ColumnStatisticsObj statsObjOld = map.get(statsObjNew.getColName()); if (statsObjOld != null) { + // because we already confirm that the stats is accurate + // it is impossible that the column types have been changed while the + // column stats is still accurate. + assert (statsObjNew.getStatsData().getSetField() == statsObjOld.getStatsData() + .getSetField()); // If statsObjOld is found, we can merge. 
diff --git metastore/src/java/org/apache/hadoop/hive/metastore/ObjectStore.java metastore/src/java/org/apache/hadoop/hive/metastore/ObjectStore.java
index 3053dcb50b..baeee8be71 100644
--- metastore/src/java/org/apache/hadoop/hive/metastore/ObjectStore.java
+++ metastore/src/java/org/apache/hadoop/hive/metastore/ObjectStore.java
@@ -7081,7 +7081,7 @@ public boolean updateTableColumnStatistics(ColumnStatistics colStats)
       MTableColumnStatistics mStatsObj = StatObjectConverter.convertToMTableColumnStatistics(
           ensureGetMTable(statsDesc.getDbName(), statsDesc.getTableName()), statsDesc, statsObj);
       writeMTableColumnStatistics(table, mStatsObj, oldStats.get(statsObj.getColName()));
-      colNames.add(statsObj.getColName());
+      // There is no need to add the column name again; doing so yields duplicate colNames.
     }

     // Set the table properties
diff --git metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/merge/ColumnStatsMergerFactory.java metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/merge/ColumnStatsMergerFactory.java
index 66be524139..0dc2aa2fee 100644
--- metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/merge/ColumnStatsMergerFactory.java
+++ metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/merge/ColumnStatsMergerFactory.java
@@ -70,7 +70,7 @@ public static ColumnStatsMerger getColumnStatsMerger(ColumnStatisticsObj statsOb
       break;
     }
     default:
-      throw new IllegalArgumentException("Unknown stats type " + typeNew.toString());
+      throw new IllegalArgumentException("Unknown stats type " + statsObjNew.getStatsData().getSetField());
     }
     return agg;
   }
diff --git ql/src/java/org/apache/hadoop/hive/ql/DriverContext.java ql/src/java/org/apache/hadoop/hive/ql/DriverContext.java
index f43992c85d..eb94b939f1 100644
--- ql/src/java/org/apache/hadoop/hive/ql/DriverContext.java
+++ ql/src/java/org/apache/hadoop/hive/ql/DriverContext.java
@@ -18,11 +18,12 @@

 package org.apache.hadoop.hive.ql;

+import org.apache.hadoop.hive.ql.exec.StatsTask;
 import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
 import org.apache.hadoop.hive.ql.exec.NodeUtils;
 import org.apache.hadoop.hive.ql.exec.NodeUtils.Function;
 import org.apache.hadoop.hive.ql.exec.Operator;
-import org.apache.hadoop.hive.ql.exec.StatsTask;
+import org.apache.hadoop.hive.ql.exec.BasicStatsTask;
 import org.apache.hadoop.hive.ql.exec.Task;
 import org.apache.hadoop.hive.ql.exec.TaskRunner;
 import org.apache.hadoop.hive.ql.exec.mr.MapRedTask;
@@ -64,7 +65,7 @@
   private Context ctx;
   private boolean shutdown;

-  final Map<String, StatsTask> statsTasks = new HashMap<String, StatsTask>(1);
+  final Map<String, StatsTask> statsTasks = new HashMap<>(1);

   public DriverContext() {
   }
@@ -191,7 +192,9 @@ public void prepare(QueryPlan plan) {
     NodeUtils.iterateTask(rootTasks, StatsTask.class, new Function<StatsTask>() {
       @Override
       public void apply(StatsTask statsTask) {
-        statsTasks.put(statsTask.getWork().getAggKey(), statsTask);
+        if (statsTask.getWork().getBasicStatsWork() != null) {
+          statsTasks.put(statsTask.getWork().getBasicStatsWork().getAggKey(), statsTask);
+        }
       }
     });
   }
@@ -221,7 +224,7 @@ public void apply(FileSinkOperator fsOp) {
       }
     });
     for (String statKey : statKeys) {
-      statsTasks.get(statKey).getWork().setSourceTask(mapredTask);
+
statsTasks.get(statKey).getWork().getBasicStatsWork().setSourceTask(mapredTask); } } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/StatsNoJobTask.java ql/src/java/org/apache/hadoop/hive/ql/exec/BasicStatsNoJobTask.java similarity index 98% rename from ql/src/java/org/apache/hadoop/hive/ql/exec/StatsNoJobTask.java rename to ql/src/java/org/apache/hadoop/hive/ql/exec/BasicStatsNoJobTask.java index 9c3a664b9a..beb68f2703 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/StatsNoJobTask.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/BasicStatsNoJobTask.java @@ -48,7 +48,7 @@ import org.apache.hadoop.hive.ql.metadata.Partition; import org.apache.hadoop.hive.ql.metadata.Table; import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.TableSpec; -import org.apache.hadoop.hive.ql.plan.StatsNoJobWork; +import org.apache.hadoop.hive.ql.plan.BasicStatsNoJobWork; import org.apache.hadoop.hive.ql.plan.api.StageType; import org.apache.hadoop.mapred.FileSplit; import org.apache.hadoop.mapred.InputFormat; @@ -71,16 +71,16 @@ * rows. This task can be used for computing basic stats like numFiles, numRows, fileSize, * rawDataSize from ORC footer. **/ -public class StatsNoJobTask extends Task implements Serializable { +public class BasicStatsNoJobTask extends Task implements Serializable { private static final long serialVersionUID = 1L; - private static transient final Logger LOG = LoggerFactory.getLogger(StatsNoJobTask.class); + private static transient final Logger LOG = LoggerFactory.getLogger(BasicStatsNoJobTask.class); private ConcurrentMap partUpdates; private Table table; private String tableFullName; private JobConf jc = null; - public StatsNoJobTask() { + public BasicStatsNoJobTask() { super(); } @@ -141,7 +141,6 @@ public void run() { // get the list of partitions org.apache.hadoop.hive.metastore.api.Partition tPart = partn.getTPartition(); Map parameters = tPart.getParameters(); - try { Path dir = new Path(tPart.getSd().getLocation()); long numRows = 0; @@ -174,6 +173,7 @@ public void run() { } if (statsAvailable) { + parameters.put(StatsSetupConst.ROW_COUNT, String.valueOf(numRows)); parameters.put(StatsSetupConst.RAW_DATA_SIZE, String.valueOf(rawDataSize)); parameters.put(StatsSetupConst.TOTAL_SIZE, String.valueOf(fileSize)); @@ -280,7 +280,6 @@ private int aggregateStats(ExecutorService threadPool, Hive db) { parameters.put(StatsSetupConst.NUM_FILES, String.valueOf(numFiles)); EnvironmentContext environmentContext = new EnvironmentContext(); environmentContext.putToProperties(StatsSetupConst.STATS_GENERATED, StatsSetupConst.TASK); - db.alterTable(tableFullName, new Table(tTable), environmentContext); String msg = "Table " + tableFullName + " stats: [" + toString(parameters) + ']'; diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/BasicStatsTask.java ql/src/java/org/apache/hadoop/hive/ql/exec/BasicStatsTask.java new file mode 100644 index 0000000000..d3119040bd --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/BasicStatsTask.java @@ -0,0 +1,519 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.hadoop.hive.ql.exec;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.StatsSetupConst;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
+import org.apache.hadoop.hive.metastore.MetaStoreUtils;
+import org.apache.hadoop.hive.metastore.Warehouse;
+import org.apache.hadoop.hive.metastore.api.EnvironmentContext;
+import org.apache.hadoop.hive.metastore.api.MetaException;
+import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
+import org.apache.hadoop.hive.ql.DriverContext;
+import org.apache.hadoop.hive.ql.ErrorMsg;
+import org.apache.hadoop.hive.ql.io.AcidUtils;
+import org.apache.hadoop.hive.ql.metadata.Hive;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.metadata.Partition;
+import org.apache.hadoop.hive.ql.metadata.Table;
+import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.TableSpec;
+import org.apache.hadoop.hive.ql.parse.ExplainConfiguration.AnalyzeState;
+import org.apache.hadoop.hive.ql.plan.DynamicPartitionCtx;
+import org.apache.hadoop.hive.ql.plan.LoadTableDesc;
+import org.apache.hadoop.hive.ql.plan.BasicStatsWork;
+import org.apache.hadoop.hive.ql.plan.api.StageType;
+import org.apache.hadoop.hive.ql.stats.StatsAggregator;
+import org.apache.hadoop.hive.ql.stats.StatsCollectionContext;
+import org.apache.hadoop.hive.ql.stats.StatsFactory;
+import org.apache.hadoop.hive.ql.stats.StatsPublisher;
+import org.apache.hadoop.util.StringUtils;
+
+import com.google.common.collect.Lists;
+import com.google.common.util.concurrent.ThreadFactoryBuilder;
+
+/**
+ * BasicStatsTask implementation. BasicStatsTask mainly deals with "collectable" stats. These are
+ * stats that require data scanning and are collected during query execution (unless the user
+ * explicitly requests data scanning just for the purpose of stats computation using the "ANALYZE"
+ * command). All other stats are computed directly by the MetaStore. The rationale is that the
+ * MetaStore layer covers all Thrift calls and provides better guarantees about the accuracy of
+ * those stats.
+ **/ +public class BasicStatsTask extends Task implements Serializable { + + private static final long serialVersionUID = 1L; + private static transient final Logger LOG = LoggerFactory.getLogger(BasicStatsTask.class); + + private Table table; + private Collection dpPartSpecs; + + public BasicStatsTask() { + super(); + dpPartSpecs = null; + } + + @Override + public int execute(DriverContext driverContext) { + if (driverContext.getCtx().getExplainAnalyze() == AnalyzeState.RUNNING) { + return 0; + } + LOG.info("Executing stats task"); + // Make sure that it is either an ANALYZE, INSERT OVERWRITE (maybe load) or CTAS command + short workComponentsPresent = 0; + if (work.getLoadTableDesc() != null) { + workComponentsPresent++; + } + if (work.getTableSpecs() != null) { + workComponentsPresent++; + } + if (work.getLoadFileDesc() != null) { + workComponentsPresent++; + } + + assert (workComponentsPresent == 1); + + String tableName = ""; + Hive hive = getHive(); + try { + if (work.getLoadTableDesc() != null) { + tableName = work.getLoadTableDesc().getTable().getTableName(); + } else if (work.getTableSpecs() != null){ + tableName = work.getTableSpecs().tableName; + } else { + tableName = work.getLoadFileDesc().getDestinationCreateTable(); + } + + table = hive.getTable(tableName); + + } catch (HiveException e) { + LOG.error("Cannot get table " + tableName, e); + console.printError("Cannot get table " + tableName, e.toString()); + } + + return aggregateStats(hive); + + } + + @Override + public StageType getType() { + return StageType.STATS; + } + + @Override + public String getName() { + return "STATS"; + } + + private int aggregateStats(Hive db) { + + StatsAggregator statsAggregator = null; + int ret = 0; + StatsCollectionContext scc = null; + EnvironmentContext environmentContext = null; + try { + // Stats setup: + final Warehouse wh = new Warehouse(conf); + if (!getWork().getNoStatsAggregator() && !getWork().isNoScanAnalyzeCommand()) { + try { + scc = getContext(); + statsAggregator = createStatsAggregator(scc, conf); + } catch (HiveException e) { + if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) { + throw e; + } + console.printError(ErrorMsg.STATS_SKIPPING_BY_ERROR.getErrorCodedMsg(e.toString())); + } + } + + List partitions = getPartitionsList(db); + boolean atomic = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_ATOMIC); + + String tableFullName = table.getDbName() + "." + table.getTableName(); + + if (partitions == null) { + org.apache.hadoop.hive.metastore.api.Table tTable = table.getTTable(); + Map parameters = tTable.getParameters(); + // In the following scenarios, we need to reset the stats to true. + // work.getTableSpecs() != null means analyze command + // work.getLoadTableDesc().getReplace() is true means insert overwrite command + // work.getLoadFileDesc().getDestinationCreateTable().isEmpty() means CTAS etc. + // acidTable will not have accurate stats unless it is set through analyze command. 
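[Editor's note] The scenario comment above is dense, and the if/else-if that follows implements it. Restated as a hypothetical fragment (named booleans are editorial, not code from the patch), the decision is:

    // When may basic stats be marked accurate after this task runs?
    boolean isAnalyze = work.getTableSpecs() != null;
    boolean isInsertOverwrite =
        work.getLoadTableDesc() != null && work.getLoadTableDesc().getReplace();
    boolean isCtas = work.getLoadFileDesc() != null
        && !work.getLoadFileDesc().getDestinationCreateTable().isEmpty();
    if (!isAnalyze && AcidUtils.isAcidTable(table)) {
      // ACID tables are only trustworthy after an explicit ANALYZE
      StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.FALSE);
    } else if (isAnalyze || isInsertOverwrite || isCtas) {
      StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.TRUE);
    }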
+ if (work.getTableSpecs() == null && AcidUtils.isAcidTable(table)) { + StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.FALSE); + } else if (work.getTableSpecs() != null + || (work.getLoadTableDesc() != null && work.getLoadTableDesc().getReplace()) + || (work.getLoadFileDesc() != null && !work.getLoadFileDesc() + .getDestinationCreateTable().isEmpty())) { + StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.TRUE); + } + // work.getTableSpecs() == null means it is not analyze command + // and then if it is not followed by column stats, we should clean + // column stats + if (work.getTableSpecs() == null && !work.isFollowedByColStats()) { + StatsSetupConst.clearColumnStatsState(parameters); + } + // non-partitioned tables: + if (!existStats(parameters) && atomic) { + return 0; + } + + // The collectable stats for the aggregator needs to be cleared. + // For eg. if a file is being loaded, the old number of rows are not valid + if (work.isClearAggregatorStats()) { + // we choose to keep the invalid stats and only change the setting. + StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.FALSE); + } + + updateQuickStats(wh, parameters, tTable.getSd()); + if (StatsSetupConst.areBasicStatsUptoDate(parameters)) { + if (statsAggregator != null) { + String prefix = getAggregationPrefix(table, null); + updateStats(statsAggregator, parameters, prefix, atomic); + } + // write table stats to metastore + if (!getWork().getNoStatsAggregator()) { + environmentContext = new EnvironmentContext(); + environmentContext.putToProperties(StatsSetupConst.STATS_GENERATED, + StatsSetupConst.TASK); + } + } + + getHive().alterTable(tableFullName, new Table(tTable), environmentContext); + if (conf.getBoolVar(ConfVars.TEZ_EXEC_SUMMARY)) { + console.printInfo("Table " + tableFullName + " stats: [" + toString(parameters) + ']'); + } + LOG.info("Table " + tableFullName + " stats: [" + toString(parameters) + ']'); + } else { + // Partitioned table: + // Need to get the old stats of the partition + // and update the table stats based on the old and new stats. + List updates = new ArrayList(); + + //Get the file status up-front for all partitions. Beneficial in cases of blob storage systems + final Map fileStatusMap = new ConcurrentHashMap(); + int poolSize = conf.getInt(ConfVars.HIVE_MOVE_FILES_THREAD_COUNT.varname, 1); + // In case thread count is set to 0, use single thread. + poolSize = Math.max(poolSize, 1); + final ExecutorService pool = Executors.newFixedThreadPool(poolSize, + new ThreadFactoryBuilder().setDaemon(true) + .setNameFormat("stats-updater-thread-%d") + .build()); + final List> futures = Lists.newLinkedList(); + LOG.debug("Getting file stats of all partitions. 
threadpool size:" + poolSize); + try { + for(final Partition partn : partitions) { + final String partitionName = partn.getName(); + final org.apache.hadoop.hive.metastore.api.Partition tPart = partn.getTPartition(); + Map parameters = tPart.getParameters(); + + if (!existStats(parameters) && atomic) { + continue; + } + futures.add(pool.submit(new Callable() { + @Override + public Void call() throws Exception { + FileStatus[] partfileStatus = wh.getFileStatusesForSD(tPart.getSd()); + fileStatusMap.put(partitionName, partfileStatus); + return null; + } + })); + } + pool.shutdown(); + for(Future future : futures) { + future.get(); + } + } catch (InterruptedException e) { + LOG.debug("Cancelling " + futures.size() + " file stats lookup tasks"); + //cancel other futures + for (Future future : futures) { + future.cancel(true); + } + // Fail the query if the stats are supposed to be reliable + if (work.isStatsReliable()) { + ret = 1; + } + } finally { + if (pool != null) { + pool.shutdownNow(); + } + LOG.debug("Finished getting file stats of all partitions"); + } + + for (Partition partn : partitions) { + // + // get the old partition stats + // + org.apache.hadoop.hive.metastore.api.Partition tPart = partn.getTPartition(); + Map parameters = tPart.getParameters(); + if (work.getTableSpecs() == null && AcidUtils.isAcidTable(table)) { + StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.FALSE); + } else if (work.getTableSpecs() != null + || (work.getLoadTableDesc() != null && work.getLoadTableDesc().getReplace()) + || (work.getLoadFileDesc() != null && !work.getLoadFileDesc() + .getDestinationCreateTable().isEmpty())) { + StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.TRUE); + } + // work.getTableSpecs() == null means it is not analyze command + // and then if it is not followed by column stats, we should clean + // column stats + if (work.getTableSpecs() == null && !work.isFollowedByColStats()) { + StatsSetupConst.clearColumnStatsState(parameters); + } + //only when the stats exist, it is added to fileStatusMap + if (!fileStatusMap.containsKey(partn.getName())) { + continue; + } + + // The collectable stats for the aggregator needs to be cleared. + // For eg. if a file is being loaded, the old number of rows are not valid + if (work.isClearAggregatorStats()) { + // we choose to keep the invalid stats and only change the setting. 
+ StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.FALSE); + } + + updateQuickStats(parameters, fileStatusMap.get(partn.getName())); + if (StatsSetupConst.areBasicStatsUptoDate(parameters)) { + if (statsAggregator != null) { + String prefix = getAggregationPrefix(table, partn); + updateStats(statsAggregator, parameters, prefix, atomic); + } + if (!getWork().getNoStatsAggregator()) { + environmentContext = new EnvironmentContext(); + environmentContext.putToProperties(StatsSetupConst.STATS_GENERATED, + StatsSetupConst.TASK); + } + } + updates.add(new Partition(table, tPart)); + + if (conf.getBoolVar(ConfVars.TEZ_EXEC_SUMMARY)) { + console.printInfo("Partition " + tableFullName + partn.getSpec() + + " stats: [" + toString(parameters) + ']'); + } + LOG.info("Partition " + tableFullName + partn.getSpec() + + " stats: [" + toString(parameters) + ']'); + } + if (!updates.isEmpty()) { + db.alterPartitions(tableFullName, updates, environmentContext); + } + } + + } catch (Exception e) { + console.printInfo("[Warning] could not update stats.", + "Failed with exception " + e.getMessage() + "\n" + + StringUtils.stringifyException(e)); + + // Fail the query if the stats are supposed to be reliable + if (work.isStatsReliable()) { + ret = 1; + } + } finally { + if (statsAggregator != null) { + statsAggregator.closeConnection(scc); + } + } + // The return value of 0 indicates success, + // anything else indicates failure + return ret; + } + + private String getAggregationPrefix(Table table, Partition partition) + throws MetaException { + + // prefix is of the form dbName.tblName + String prefix = table.getDbName() + "." + MetaStoreUtils.encodeTableName(table.getTableName()); + if (partition != null) { + return Utilities.join(prefix, Warehouse.makePartPath(partition.getSpec())); + } + return prefix; + } + + private StatsAggregator createStatsAggregator(StatsCollectionContext scc, HiveConf conf) throws HiveException { + String statsImpl = HiveConf.getVar(conf, HiveConf.ConfVars.HIVESTATSDBCLASS); + StatsFactory factory = StatsFactory.newFactory(statsImpl, conf); + if (factory == null) { + throw new HiveException(ErrorMsg.STATSPUBLISHER_NOT_OBTAINED.getErrorCodedMsg()); + } + // initialize stats publishing table for noscan which has only stats task + // the rest of MR task following stats task initializes it in ExecDriver.java + StatsPublisher statsPublisher = factory.getStatsPublisher(); + if (!statsPublisher.init(scc)) { // creating stats table if not exists + throw new HiveException(ErrorMsg.STATSPUBLISHER_INITIALIZATION_ERROR.getErrorCodedMsg()); + } + + // manufacture a StatsAggregator + StatsAggregator statsAggregator = factory.getStatsAggregator(); + if (!statsAggregator.connect(scc)) { + throw new HiveException(ErrorMsg.STATSAGGREGATOR_CONNECTION_ERROR.getErrorCodedMsg(statsImpl)); + } + return statsAggregator; + } + + private StatsCollectionContext getContext() throws HiveException { + + StatsCollectionContext scc = new StatsCollectionContext(conf); + Task sourceTask = getWork().getSourceTask(); + if (sourceTask == null) { + throw new HiveException(ErrorMsg.STATSAGGREGATOR_SOURCETASK_NULL.getErrorCodedMsg()); + } + scc.setTask(sourceTask); + scc.setStatsTmpDir(this.getWork().getStatsTmpDir()); + return scc; + } + + private boolean existStats(Map parameters) { + return parameters.containsKey(StatsSetupConst.ROW_COUNT) + || parameters.containsKey(StatsSetupConst.NUM_FILES) + || parameters.containsKey(StatsSetupConst.TOTAL_SIZE) + || parameters.containsKey(StatsSetupConst.RAW_DATA_SIZE) + 
|| parameters.containsKey(StatsSetupConst.NUM_PARTITIONS); + } + + private void updateStats(StatsAggregator statsAggregator, + Map parameters, String prefix, boolean atomic) + throws HiveException { + + String aggKey = prefix.endsWith(Path.SEPARATOR) ? prefix : prefix + Path.SEPARATOR; + + for (String statType : StatsSetupConst.statsRequireCompute) { + String value = statsAggregator.aggregateStats(aggKey, statType); + if (value != null && !value.isEmpty()) { + long longValue = Long.parseLong(value); + + if (work.getLoadTableDesc() != null && + !work.getLoadTableDesc().getReplace()) { + String originalValue = parameters.get(statType); + if (originalValue != null) { + longValue += Long.parseLong(originalValue); // todo: invalid + valid = invalid + } + } + parameters.put(statType, String.valueOf(longValue)); + } else { + if (atomic) { + throw new HiveException(ErrorMsg.STATSAGGREGATOR_MISSED_SOMESTATS, statType); + } + } + } + } + + private void updateQuickStats(Warehouse wh, Map parameters, + StorageDescriptor desc) throws MetaException { + /** + * calculate fast statistics + */ + FileStatus[] partfileStatus = wh.getFileStatusesForSD(desc); + updateQuickStats(parameters, partfileStatus); + } + + private void updateQuickStats(Map parameters, + FileStatus[] partfileStatus) throws MetaException { + MetaStoreUtils.populateQuickStats(partfileStatus, parameters); + } + + private String toString(Map parameters) { + StringBuilder builder = new StringBuilder(); + for (String statType : StatsSetupConst.supportedStats) { + String value = parameters.get(statType); + if (value != null) { + if (builder.length() > 0) { + builder.append(", "); + } + builder.append(statType).append('=').append(value); + } + } + return builder.toString(); + } + + /** + * Get the list of partitions that need to update statistics. + * TODO: we should reuse the Partitions generated at compile time + * since getting the list of partitions is quite expensive. + * + * @return a list of partitions that need to update statistics. 
+ * @throws HiveException + */ + private List getPartitionsList(Hive db) throws HiveException { + if (work.getLoadFileDesc() != null) { + return null; //we are in CTAS, so we know there are no partitions + } + + List list = new ArrayList(); + + if (work.getTableSpecs() != null) { + + // ANALYZE command + TableSpec tblSpec = work.getTableSpecs(); + table = tblSpec.tableHandle; + if (!table.isPartitioned()) { + return null; + } + // get all partitions that matches with the partition spec + List partitions = tblSpec.partitions; + if (partitions != null) { + for (Partition partn : partitions) { + list.add(partn); + } + } + } else if (work.getLoadTableDesc() != null) { + + // INSERT OVERWRITE command + LoadTableDesc tbd = work.getLoadTableDesc(); + table = db.getTable(tbd.getTable().getTableName()); + if (!table.isPartitioned()) { + return null; + } + DynamicPartitionCtx dpCtx = tbd.getDPCtx(); + if (dpCtx != null && dpCtx.getNumDPCols() > 0) { // dynamic partitions + // If no dynamic partitions are generated, dpPartSpecs may not be initialized + if (dpPartSpecs != null) { + // load the list of DP partitions and return the list of partition specs + list.addAll(dpPartSpecs); + } + } else { // static partition + Partition partn = db.getPartition(table, tbd.getPartitionSpec(), false); + list.add(partn); + } + } + return list; + } + + public Collection getDpPartSpecs() { + return dpPartSpecs; + } + + public void setDpPartSpecs(Collection dpPartSpecs) { + this.dpPartSpecs = dpPartSpecs; + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/ColumnStatsTask.java ql/src/java/org/apache/hadoop/hive/ql/exec/ColumnStatsTask.java deleted file mode 100644 index 2b2c004fea..0000000000 --- ql/src/java/org/apache/hadoop/hive/ql/exec/ColumnStatsTask.java +++ /dev/null @@ -1,452 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hadoop.hive.ql.exec; - -import java.io.IOException; -import java.io.Serializable; -import java.nio.ByteBuffer; -import java.util.ArrayList; -import java.util.List; - -import org.apache.hadoop.hive.common.type.HiveDecimal; -import org.apache.hadoop.hive.conf.HiveConf.ConfVars; -import org.apache.hadoop.hive.metastore.Warehouse; -import org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData; -import org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData; -import org.apache.hadoop.hive.metastore.api.ColumnStatistics; -import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData; -import org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc; -import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; -import org.apache.hadoop.hive.metastore.api.Date; -import org.apache.hadoop.hive.metastore.api.Decimal; -import org.apache.hadoop.hive.metastore.api.FieldSchema; -import org.apache.hadoop.hive.metastore.api.MetaException; -import org.apache.hadoop.hive.metastore.api.SetPartitionsStatsRequest; -import org.apache.hadoop.hive.metastore.columnstats.cache.DateColumnStatsDataInspector; -import org.apache.hadoop.hive.metastore.columnstats.cache.DecimalColumnStatsDataInspector; -import org.apache.hadoop.hive.metastore.columnstats.cache.DoubleColumnStatsDataInspector; -import org.apache.hadoop.hive.metastore.columnstats.cache.LongColumnStatsDataInspector; -import org.apache.hadoop.hive.metastore.columnstats.cache.StringColumnStatsDataInspector; -import org.apache.hadoop.hive.ql.CompilationOpContext; -import org.apache.hadoop.hive.ql.DriverContext; -import org.apache.hadoop.hive.ql.QueryPlan; -import org.apache.hadoop.hive.ql.QueryState; -import org.apache.hadoop.hive.ql.metadata.Hive; -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.ql.metadata.Table; -import org.apache.hadoop.hive.ql.parse.ExplainConfiguration.AnalyzeState; -import org.apache.hadoop.hive.ql.plan.ColumnStatsWork; -import org.apache.hadoop.hive.ql.plan.api.StageType; -import org.apache.hadoop.hive.ql.session.SessionState; -import org.apache.hadoop.hive.serde2.io.DateWritable; -import org.apache.hadoop.hive.serde2.objectinspector.InspectableObject; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.StructField; -import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.DateObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.util.StringUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * ColumnStatsTask implementation. 
- **/ - -public class ColumnStatsTask extends Task implements Serializable { - private static final long serialVersionUID = 1L; - private FetchOperator ftOp; - private static transient final Logger LOG = LoggerFactory.getLogger(ColumnStatsTask.class); - - public ColumnStatsTask() { - super(); - } - - @Override - public void initialize(QueryState queryState, QueryPlan queryPlan, DriverContext ctx, - CompilationOpContext opContext) { - super.initialize(queryState, queryPlan, ctx, opContext); - work.initializeForFetch(opContext); - try { - JobConf job = new JobConf(conf); - ftOp = new FetchOperator(work.getfWork(), job); - } catch (Exception e) { - LOG.error(StringUtils.stringifyException(e)); - throw new RuntimeException(e); - } - } - - private void unpackBooleanStats(ObjectInspector oi, Object o, String fName, - ColumnStatisticsObj statsObj) { - long v = ((LongObjectInspector) oi).get(o); - if (fName.equals("counttrues")) { - statsObj.getStatsData().getBooleanStats().setNumTrues(v); - } else if (fName.equals("countfalses")) { - statsObj.getStatsData().getBooleanStats().setNumFalses(v); - } else if (fName.equals("countnulls")) { - statsObj.getStatsData().getBooleanStats().setNumNulls(v); - } - } - - @SuppressWarnings("serial") - class UnsupportedDoubleException extends Exception { - } - - private void unpackDoubleStats(ObjectInspector oi, Object o, String fName, - ColumnStatisticsObj statsObj) throws UnsupportedDoubleException { - if (fName.equals("countnulls")) { - long v = ((LongObjectInspector) oi).get(o); - statsObj.getStatsData().getDoubleStats().setNumNulls(v); - } else if (fName.equals("numdistinctvalues")) { - long v = ((LongObjectInspector) oi).get(o); - statsObj.getStatsData().getDoubleStats().setNumDVs(v); - } else if (fName.equals("max")) { - double d = ((DoubleObjectInspector) oi).get(o); - if (Double.isInfinite(d) || Double.isNaN(d)) { - throw new UnsupportedDoubleException(); - } - statsObj.getStatsData().getDoubleStats().setHighValue(d); - } else if (fName.equals("min")) { - double d = ((DoubleObjectInspector) oi).get(o); - if (Double.isInfinite(d) || Double.isNaN(d)) { - throw new UnsupportedDoubleException(); - } - statsObj.getStatsData().getDoubleStats().setLowValue(d); - } else if (fName.equals("ndvbitvector")) { - PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi; - byte[] buf = ((BinaryObjectInspector) poi).getPrimitiveJavaObject(o); - statsObj.getStatsData().getDoubleStats().setBitVectors(buf); - } - } - - private void unpackDecimalStats(ObjectInspector oi, Object o, String fName, - ColumnStatisticsObj statsObj) { - if (fName.equals("countnulls")) { - long v = ((LongObjectInspector) oi).get(o); - statsObj.getStatsData().getDecimalStats().setNumNulls(v); - } else if (fName.equals("numdistinctvalues")) { - long v = ((LongObjectInspector) oi).get(o); - statsObj.getStatsData().getDecimalStats().setNumDVs(v); - } else if (fName.equals("max")) { - HiveDecimal d = ((HiveDecimalObjectInspector) oi).getPrimitiveJavaObject(o); - statsObj.getStatsData().getDecimalStats().setHighValue(convertToThriftDecimal(d)); - } else if (fName.equals("min")) { - HiveDecimal d = ((HiveDecimalObjectInspector) oi).getPrimitiveJavaObject(o); - statsObj.getStatsData().getDecimalStats().setLowValue(convertToThriftDecimal(d)); - } else if (fName.equals("ndvbitvector")) { - PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi; - byte[] buf = ((BinaryObjectInspector) poi).getPrimitiveJavaObject(o); - statsObj.getStatsData().getDecimalStats().setBitVectors(buf); - } - } - - private 
Decimal convertToThriftDecimal(HiveDecimal d) { - return new Decimal(ByteBuffer.wrap(d.unscaledValue().toByteArray()), (short)d.scale()); - } - - private void unpackLongStats(ObjectInspector oi, Object o, String fName, - ColumnStatisticsObj statsObj) { - if (fName.equals("countnulls")) { - long v = ((LongObjectInspector) oi).get(o); - statsObj.getStatsData().getLongStats().setNumNulls(v); - } else if (fName.equals("numdistinctvalues")) { - long v = ((LongObjectInspector) oi).get(o); - statsObj.getStatsData().getLongStats().setNumDVs(v); - } else if (fName.equals("max")) { - long v = ((LongObjectInspector) oi).get(o); - statsObj.getStatsData().getLongStats().setHighValue(v); - } else if (fName.equals("min")) { - long v = ((LongObjectInspector) oi).get(o); - statsObj.getStatsData().getLongStats().setLowValue(v); - } else if (fName.equals("ndvbitvector")) { - PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi; - byte[] buf = ((BinaryObjectInspector) poi).getPrimitiveJavaObject(o); - statsObj.getStatsData().getLongStats().setBitVectors(buf); - } - } - - private void unpackStringStats(ObjectInspector oi, Object o, String fName, - ColumnStatisticsObj statsObj) { - if (fName.equals("countnulls")) { - long v = ((LongObjectInspector) oi).get(o); - statsObj.getStatsData().getStringStats().setNumNulls(v); - } else if (fName.equals("numdistinctvalues")) { - long v = ((LongObjectInspector) oi).get(o); - statsObj.getStatsData().getStringStats().setNumDVs(v); - } else if (fName.equals("avglength")) { - double d = ((DoubleObjectInspector) oi).get(o); - statsObj.getStatsData().getStringStats().setAvgColLen(d); - } else if (fName.equals("maxlength")) { - long v = ((LongObjectInspector) oi).get(o); - statsObj.getStatsData().getStringStats().setMaxColLen(v); - } else if (fName.equals("ndvbitvector")) { - PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi; - byte[] buf = ((BinaryObjectInspector) poi).getPrimitiveJavaObject(o); - statsObj.getStatsData().getStringStats().setBitVectors(buf); - } - } - - private void unpackBinaryStats(ObjectInspector oi, Object o, String fName, - ColumnStatisticsObj statsObj) { - if (fName.equals("countnulls")) { - long v = ((LongObjectInspector) oi).get(o); - statsObj.getStatsData().getBinaryStats().setNumNulls(v); - } else if (fName.equals("avglength")) { - double d = ((DoubleObjectInspector) oi).get(o); - statsObj.getStatsData().getBinaryStats().setAvgColLen(d); - } else if (fName.equals("maxlength")) { - long v = ((LongObjectInspector) oi).get(o); - statsObj.getStatsData().getBinaryStats().setMaxColLen(v); - } - } - - private void unpackDateStats(ObjectInspector oi, Object o, String fName, - ColumnStatisticsObj statsObj) { - if (fName.equals("countnulls")) { - long v = ((LongObjectInspector) oi).get(o); - statsObj.getStatsData().getDateStats().setNumNulls(v); - } else if (fName.equals("numdistinctvalues")) { - long v = ((LongObjectInspector) oi).get(o); - statsObj.getStatsData().getDateStats().setNumDVs(v); - } else if (fName.equals("max")) { - DateWritable v = ((DateObjectInspector) oi).getPrimitiveWritableObject(o); - statsObj.getStatsData().getDateStats().setHighValue(new Date(v.getDays())); - } else if (fName.equals("min")) { - DateWritable v = ((DateObjectInspector) oi).getPrimitiveWritableObject(o); - statsObj.getStatsData().getDateStats().setLowValue(new Date(v.getDays())); - } else if (fName.equals("ndvbitvector")) { - PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi; - byte[] buf = ((BinaryObjectInspector) 
poi).getPrimitiveJavaObject(o); - statsObj.getStatsData().getDateStats().setBitVectors(buf); - } - } - - private void unpackPrimitiveObject (ObjectInspector oi, Object o, String fieldName, - ColumnStatisticsObj statsObj) throws UnsupportedDoubleException { - if (o == null) { - return; - } - // First infer the type of object - if (fieldName.equals("columntype")) { - PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi; - String s = ((StringObjectInspector) poi).getPrimitiveJavaObject(o); - ColumnStatisticsData statsData = new ColumnStatisticsData(); - - if (s.equalsIgnoreCase("long")) { - LongColumnStatsDataInspector longStats = new LongColumnStatsDataInspector(); - statsData.setLongStats(longStats); - statsObj.setStatsData(statsData); - } else if (s.equalsIgnoreCase("double")) { - DoubleColumnStatsDataInspector doubleStats = new DoubleColumnStatsDataInspector(); - statsData.setDoubleStats(doubleStats); - statsObj.setStatsData(statsData); - } else if (s.equalsIgnoreCase("string")) { - StringColumnStatsDataInspector stringStats = new StringColumnStatsDataInspector(); - statsData.setStringStats(stringStats); - statsObj.setStatsData(statsData); - } else if (s.equalsIgnoreCase("boolean")) { - BooleanColumnStatsData booleanStats = new BooleanColumnStatsData(); - statsData.setBooleanStats(booleanStats); - statsObj.setStatsData(statsData); - } else if (s.equalsIgnoreCase("binary")) { - BinaryColumnStatsData binaryStats = new BinaryColumnStatsData(); - statsData.setBinaryStats(binaryStats); - statsObj.setStatsData(statsData); - } else if (s.equalsIgnoreCase("decimal")) { - DecimalColumnStatsDataInspector decimalStats = new DecimalColumnStatsDataInspector(); - statsData.setDecimalStats(decimalStats); - statsObj.setStatsData(statsData); - } else if (s.equalsIgnoreCase("date")) { - DateColumnStatsDataInspector dateStats = new DateColumnStatsDataInspector(); - statsData.setDateStats(dateStats); - statsObj.setStatsData(statsData); - } - } else { - // invoke the right unpack method depending on data type of the column - if (statsObj.getStatsData().isSetBooleanStats()) { - unpackBooleanStats(oi, o, fieldName, statsObj); - } else if (statsObj.getStatsData().isSetLongStats()) { - unpackLongStats(oi, o, fieldName, statsObj); - } else if (statsObj.getStatsData().isSetDoubleStats()) { - unpackDoubleStats(oi,o,fieldName, statsObj); - } else if (statsObj.getStatsData().isSetStringStats()) { - unpackStringStats(oi, o, fieldName, statsObj); - } else if (statsObj.getStatsData().isSetBinaryStats()) { - unpackBinaryStats(oi, o, fieldName, statsObj); - } else if (statsObj.getStatsData().isSetDecimalStats()) { - unpackDecimalStats(oi, o, fieldName, statsObj); - } else if (statsObj.getStatsData().isSetDateStats()) { - unpackDateStats(oi, o, fieldName, statsObj); - } - } - } - - private void unpackStructObject(ObjectInspector oi, Object o, String fName, - ColumnStatisticsObj cStatsObj) throws UnsupportedDoubleException { - if (oi.getCategory() != ObjectInspector.Category.STRUCT) { - throw new RuntimeException("Invalid object datatype : " + oi.getCategory().toString()); - } - - StructObjectInspector soi = (StructObjectInspector) oi; - List fields = soi.getAllStructFieldRefs(); - List list = soi.getStructFieldsDataAsList(o); - - for (int i = 0; i < fields.size(); i++) { - // Get the field objectInspector, fieldName and the field object. - ObjectInspector foi = fields.get(i).getFieldObjectInspector(); - Object f = (list == null ? 
null : list.get(i)); - String fieldName = fields.get(i).getFieldName(); - - if (foi.getCategory() == ObjectInspector.Category.PRIMITIVE) { - unpackPrimitiveObject(foi, f, fieldName, cStatsObj); - } else { - unpackStructObject(foi, f, fieldName, cStatsObj); - } - } - } - - private List constructColumnStatsFromPackedRows( - Hive db) throws HiveException, MetaException, IOException { - - String currentDb = SessionState.get().getCurrentDatabase(); - String tableName = work.getColStats().getTableName(); - String partName = null; - List colName = work.getColStats().getColName(); - List colType = work.getColStats().getColType(); - boolean isTblLevel = work.getColStats().isTblLevel(); - - List stats = new ArrayList(); - InspectableObject packedRow; - Table tbl = db.getTable(currentDb, tableName); - while ((packedRow = ftOp.getNextRow()) != null) { - if (packedRow.oi.getCategory() != ObjectInspector.Category.STRUCT) { - throw new HiveException("Unexpected object type encountered while unpacking row"); - } - - List statsObjs = new ArrayList(); - StructObjectInspector soi = (StructObjectInspector) packedRow.oi; - List fields = soi.getAllStructFieldRefs(); - List list = soi.getStructFieldsDataAsList(packedRow.o); - - List partColSchema = tbl.getPartCols(); - // Partition columns are appended at end, we only care about stats column - int numOfStatCols = isTblLevel ? fields.size() : fields.size() - partColSchema.size(); - for (int i = 0; i < numOfStatCols; i++) { - // Get the field objectInspector, fieldName and the field object. - ObjectInspector foi = fields.get(i).getFieldObjectInspector(); - Object f = (list == null ? null : list.get(i)); - String fieldName = fields.get(i).getFieldName(); - ColumnStatisticsObj statsObj = new ColumnStatisticsObj(); - statsObj.setColName(colName.get(i)); - statsObj.setColType(colType.get(i)); - try { - unpackStructObject(foi, f, fieldName, statsObj); - statsObjs.add(statsObj); - } catch (UnsupportedDoubleException e) { - // due to infinity or nan. - LOG.info("Because " + colName.get(i) + " is infinite or NaN, we skip stats."); - } - } - - if (!isTblLevel) { - List partVals = new ArrayList(); - // Iterate over partition columns to figure out partition name - for (int i = fields.size() - partColSchema.size(); i < fields.size(); i++) { - Object partVal = ((PrimitiveObjectInspector)fields.get(i).getFieldObjectInspector()). - getPrimitiveJavaObject(list.get(i)); - partVals.add(partVal == null ? 
// could be null for default partition - this.conf.getVar(ConfVars.DEFAULTPARTITIONNAME) : partVal.toString()); - } - partName = Warehouse.makePartName(partColSchema, partVals); - } - String [] names = Utilities.getDbTableName(currentDb, tableName); - ColumnStatisticsDesc statsDesc = getColumnStatsDesc(names[0], names[1], partName, isTblLevel); - ColumnStatistics colStats = new ColumnStatistics(); - colStats.setStatsDesc(statsDesc); - colStats.setStatsObj(statsObjs); - if (!statsObjs.isEmpty()) { - stats.add(colStats); - } - } - ftOp.clearFetchContext(); - return stats; - } - - private ColumnStatisticsDesc getColumnStatsDesc(String dbName, String tableName, - String partName, boolean isTblLevel) - { - ColumnStatisticsDesc statsDesc = new ColumnStatisticsDesc(); - statsDesc.setDbName(dbName); - statsDesc.setTableName(tableName); - statsDesc.setIsTblLevel(isTblLevel); - - if (!isTblLevel) { - statsDesc.setPartName(partName); - } else { - statsDesc.setPartName(null); - } - return statsDesc; - } - - private int persistColumnStats(Hive db) throws HiveException, MetaException, IOException { - // Construct a column statistics object from the result - List colStats = constructColumnStatsFromPackedRows(db); - // Persist the column statistics object to the metastore - // Note, this function is shared for both table and partition column stats. - if (colStats.isEmpty()) { - return 0; - } - SetPartitionsStatsRequest request = new SetPartitionsStatsRequest(colStats); - if (work.getColStats() != null && work.getColStats().getNumBitVector() > 0) { - request.setNeedMerge(true); - } - db.setPartitionColumnStatistics(request); - return 0; - } - - @Override - public int execute(DriverContext driverContext) { - if (driverContext.getCtx().getExplainAnalyze() == AnalyzeState.RUNNING) { - return 0; - } - try { - Hive db = getHive(); - return persistColumnStats(db); - } catch (Exception e) { - LOG.error("Failed to run column stats task", e); - } - return 1; - } - - @Override - public StageType getType() { - return StageType.COLUMNSTATS; - } - - @Override - public String getName() { - return "COLUMNSTATS TASK"; - } -} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/StatsTask.java ql/src/java/org/apache/hadoop/hive/ql/exec/StatsTask.java index c22d69bb19..c0b796579e 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/StatsTask.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/StatsTask.java @@ -16,493 +16,484 @@ * limitations under the License. 
*/ - package org.apache.hadoop.hive.ql.exec; +import java.io.IOException; import java.io.Serializable; +import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Collection; -import java.util.LinkedHashMap; import java.util.List; -import java.util.Map; -import java.util.concurrent.Callable; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.Future; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.common.StatsSetupConst; -import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.common.type.HiveDecimal; import org.apache.hadoop.hive.conf.HiveConf.ConfVars; -import org.apache.hadoop.hive.metastore.MetaStoreUtils; import org.apache.hadoop.hive.metastore.Warehouse; -import org.apache.hadoop.hive.metastore.api.EnvironmentContext; +import org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData; +import org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData; +import org.apache.hadoop.hive.metastore.api.ColumnStatistics; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; +import org.apache.hadoop.hive.metastore.api.Date; +import org.apache.hadoop.hive.metastore.api.Decimal; +import org.apache.hadoop.hive.metastore.api.FieldSchema; import org.apache.hadoop.hive.metastore.api.MetaException; -import org.apache.hadoop.hive.metastore.api.StorageDescriptor; +import org.apache.hadoop.hive.metastore.api.SetPartitionsStatsRequest; +import org.apache.hadoop.hive.metastore.columnstats.cache.DateColumnStatsDataInspector; +import org.apache.hadoop.hive.metastore.columnstats.cache.DecimalColumnStatsDataInspector; +import org.apache.hadoop.hive.metastore.columnstats.cache.DoubleColumnStatsDataInspector; +import org.apache.hadoop.hive.metastore.columnstats.cache.LongColumnStatsDataInspector; +import org.apache.hadoop.hive.metastore.columnstats.cache.StringColumnStatsDataInspector; +import org.apache.hadoop.hive.ql.CompilationOpContext; import org.apache.hadoop.hive.ql.DriverContext; -import org.apache.hadoop.hive.ql.ErrorMsg; -import org.apache.hadoop.hive.ql.io.AcidUtils; +import org.apache.hadoop.hive.ql.QueryPlan; +import org.apache.hadoop.hive.ql.QueryState; import org.apache.hadoop.hive.ql.metadata.Hive; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.metadata.Partition; import org.apache.hadoop.hive.ql.metadata.Table; -import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.TableSpec; import org.apache.hadoop.hive.ql.parse.ExplainConfiguration.AnalyzeState; -import org.apache.hadoop.hive.ql.plan.DynamicPartitionCtx; -import org.apache.hadoop.hive.ql.plan.LoadTableDesc; +import org.apache.hadoop.hive.ql.plan.BasicStatsNoJobWork; +import org.apache.hadoop.hive.ql.plan.BasicStatsWork; import org.apache.hadoop.hive.ql.plan.StatsWork; import org.apache.hadoop.hive.ql.plan.api.StageType; -import org.apache.hadoop.hive.ql.stats.StatsAggregator; -import org.apache.hadoop.hive.ql.stats.StatsCollectionContext; -import org.apache.hadoop.hive.ql.stats.StatsFactory; -import org.apache.hadoop.hive.ql.stats.StatsPublisher; +import org.apache.hadoop.hive.ql.session.SessionState; +import org.apache.hadoop.hive.serde2.io.DateWritable; +import 
org.apache.hadoop.hive.serde2.objectinspector.InspectableObject; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.DateObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector; +import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.util.StringUtils; - -import com.google.common.collect.Lists; -import com.google.common.util.concurrent.ThreadFactoryBuilder; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** - * StatsTask implementation. StatsTask mainly deals with "collectable" stats. These are - * stats that require data scanning and are collected during query execution (unless the user - * explicitly requests data scanning just for the purpose of stats computation using the "ANALYZE" - * command. All other stats are computed directly by the MetaStore. The rationale being that the - * MetaStore layer covers all Thrift calls and provides better guarantees about the accuracy of - * those stats. + * StatsTask implementation. **/ -public class StatsTask extends Task implements Serializable { +public class StatsTask extends Task implements Serializable { private static final long serialVersionUID = 1L; + private FetchOperator ftOp; private static transient final Logger LOG = LoggerFactory.getLogger(StatsTask.class); - private Table table; - private Collection dpPartSpecs; - public StatsTask() { super(); - dpPartSpecs = null; } @Override - protected void receiveFeed(FeedType feedType, Object feedValue) { - // this method should be called by MoveTask when there are dynamic partitions generated - if (feedType == FeedType.DYNAMIC_PARTITIONS) { - dpPartSpecs = (Collection) feedValue; + public void initialize(QueryState queryState, QueryPlan queryPlan, DriverContext ctx, + CompilationOpContext opContext) { + super.initialize(queryState, queryPlan, ctx, opContext); + if (work.getfWork() != null) { + work.initializeForFetch(opContext); + try { + JobConf job = new JobConf(conf); + ftOp = new FetchOperator(work.getfWork(), job); + } catch (Exception e) { + LOG.error(StringUtils.stringifyException(e)); + throw new RuntimeException(e); + } } } - @Override - public int execute(DriverContext driverContext) { - if (driverContext.getCtx().getExplainAnalyze() == AnalyzeState.RUNNING) { - return 0; - } - LOG.info("Executing stats task"); - // Make sure that it is either an ANALYZE, INSERT OVERWRITE (maybe load) or CTAS command - short workComponentsPresent = 0; - if (work.getLoadTableDesc() != null) { - workComponentsPresent++; - } - if (work.getTableSpecs() != null) { - workComponentsPresent++; - } - if (work.getLoadFileDesc() != null) { - workComponentsPresent++; + private void unpackBooleanStats(ObjectInspector oi, Object o, String fName, + ColumnStatisticsObj statsObj) { + long v = ((LongObjectInspector) oi).get(o); + if (fName.equals("counttrues")) { + 
statsObj.getStatsData().getBooleanStats().setNumTrues(v); + } else if (fName.equals("countfalses")) { + statsObj.getStatsData().getBooleanStats().setNumFalses(v); + } else if (fName.equals("countnulls")) { + statsObj.getStatsData().getBooleanStats().setNumNulls(v); } + } - assert (workComponentsPresent == 1); + @SuppressWarnings("serial") + class UnsupportedDoubleException extends Exception { + } - String tableName = ""; - Hive hive = getHive(); - try { - if (work.getLoadTableDesc() != null) { - tableName = work.getLoadTableDesc().getTable().getTableName(); - } else if (work.getTableSpecs() != null){ - tableName = work.getTableSpecs().tableName; - } else { - tableName = work.getLoadFileDesc().getDestinationCreateTable(); + private void unpackDoubleStats(ObjectInspector oi, Object o, String fName, + ColumnStatisticsObj statsObj) throws UnsupportedDoubleException { + if (fName.equals("countnulls")) { + long v = ((LongObjectInspector) oi).get(o); + statsObj.getStatsData().getDoubleStats().setNumNulls(v); + } else if (fName.equals("numdistinctvalues")) { + long v = ((LongObjectInspector) oi).get(o); + statsObj.getStatsData().getDoubleStats().setNumDVs(v); + } else if (fName.equals("max")) { + double d = ((DoubleObjectInspector) oi).get(o); + if (Double.isInfinite(d) || Double.isNaN(d)) { + throw new UnsupportedDoubleException(); } + statsObj.getStatsData().getDoubleStats().setHighValue(d); + } else if (fName.equals("min")) { + double d = ((DoubleObjectInspector) oi).get(o); + if (Double.isInfinite(d) || Double.isNaN(d)) { + throw new UnsupportedDoubleException(); + } + statsObj.getStatsData().getDoubleStats().setLowValue(d); + } else if (fName.equals("ndvbitvector")) { + PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi; + byte[] buf = ((BinaryObjectInspector) poi).getPrimitiveJavaObject(o); + statsObj.getStatsData().getDoubleStats().setBitVectors(buf); + } + } - table = hive.getTable(tableName); - - } catch (HiveException e) { - LOG.error("Cannot get table " + tableName, e); - console.printError("Cannot get table " + tableName, e.toString()); + private void unpackDecimalStats(ObjectInspector oi, Object o, String fName, + ColumnStatisticsObj statsObj) { + if (fName.equals("countnulls")) { + long v = ((LongObjectInspector) oi).get(o); + statsObj.getStatsData().getDecimalStats().setNumNulls(v); + } else if (fName.equals("numdistinctvalues")) { + long v = ((LongObjectInspector) oi).get(o); + statsObj.getStatsData().getDecimalStats().setNumDVs(v); + } else if (fName.equals("max")) { + HiveDecimal d = ((HiveDecimalObjectInspector) oi).getPrimitiveJavaObject(o); + statsObj.getStatsData().getDecimalStats().setHighValue(convertToThriftDecimal(d)); + } else if (fName.equals("min")) { + HiveDecimal d = ((HiveDecimalObjectInspector) oi).getPrimitiveJavaObject(o); + statsObj.getStatsData().getDecimalStats().setLowValue(convertToThriftDecimal(d)); + } else if (fName.equals("ndvbitvector")) { + PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi; + byte[] buf = ((BinaryObjectInspector) poi).getPrimitiveJavaObject(o); + statsObj.getStatsData().getDecimalStats().setBitVectors(buf); + } + } - return aggregateStats(hive); + private Decimal convertToThriftDecimal(HiveDecimal d) { + return new Decimal(ByteBuffer.wrap(d.unscaledValue().toByteArray()), (short) d.scale()); + } + private void unpackLongStats(ObjectInspector oi, Object o, String fName, + ColumnStatisticsObj statsObj) { + if (fName.equals("countnulls")) { + long v = ((LongObjectInspector) oi).get(o); + 
statsObj.getStatsData().getLongStats().setNumNulls(v); + } else if (fName.equals("numdistinctvalues")) { + long v = ((LongObjectInspector) oi).get(o); + statsObj.getStatsData().getLongStats().setNumDVs(v); + } else if (fName.equals("max")) { + long v = ((LongObjectInspector) oi).get(o); + statsObj.getStatsData().getLongStats().setHighValue(v); + } else if (fName.equals("min")) { + long v = ((LongObjectInspector) oi).get(o); + statsObj.getStatsData().getLongStats().setLowValue(v); + } else if (fName.equals("ndvbitvector")) { + PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi; + byte[] buf = ((BinaryObjectInspector) poi).getPrimitiveJavaObject(o); + statsObj.getStatsData().getLongStats().setBitVectors(buf); + } } - @Override - public StageType getType() { - return StageType.STATS; + private void unpackStringStats(ObjectInspector oi, Object o, String fName, + ColumnStatisticsObj statsObj) { + if (fName.equals("countnulls")) { + long v = ((LongObjectInspector) oi).get(o); + statsObj.getStatsData().getStringStats().setNumNulls(v); + } else if (fName.equals("numdistinctvalues")) { + long v = ((LongObjectInspector) oi).get(o); + statsObj.getStatsData().getStringStats().setNumDVs(v); + } else if (fName.equals("avglength")) { + double d = ((DoubleObjectInspector) oi).get(o); + statsObj.getStatsData().getStringStats().setAvgColLen(d); + } else if (fName.equals("maxlength")) { + long v = ((LongObjectInspector) oi).get(o); + statsObj.getStatsData().getStringStats().setMaxColLen(v); + } else if (fName.equals("ndvbitvector")) { + PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi; + byte[] buf = ((BinaryObjectInspector) poi).getPrimitiveJavaObject(o); + statsObj.getStatsData().getStringStats().setBitVectors(buf); + } } - @Override - public String getName() { - return "STATS"; + private void unpackBinaryStats(ObjectInspector oi, Object o, String fName, + ColumnStatisticsObj statsObj) { + if (fName.equals("countnulls")) { + long v = ((LongObjectInspector) oi).get(o); + statsObj.getStatsData().getBinaryStats().setNumNulls(v); + } else if (fName.equals("avglength")) { + double d = ((DoubleObjectInspector) oi).get(o); + statsObj.getStatsData().getBinaryStats().setAvgColLen(d); + } else if (fName.equals("maxlength")) { + long v = ((LongObjectInspector) oi).get(o); + statsObj.getStatsData().getBinaryStats().setMaxColLen(v); + } } - private int aggregateStats(Hive db) { + private void unpackDateStats(ObjectInspector oi, Object o, String fName, + ColumnStatisticsObj statsObj) { + if (fName.equals("countnulls")) { + long v = ((LongObjectInspector) oi).get(o); + statsObj.getStatsData().getDateStats().setNumNulls(v); + } else if (fName.equals("numdistinctvalues")) { + long v = ((LongObjectInspector) oi).get(o); + statsObj.getStatsData().getDateStats().setNumDVs(v); + } else if (fName.equals("max")) { + DateWritable v = ((DateObjectInspector) oi).getPrimitiveWritableObject(o); + statsObj.getStatsData().getDateStats().setHighValue(new Date(v.getDays())); + } else if (fName.equals("min")) { + DateWritable v = ((DateObjectInspector) oi).getPrimitiveWritableObject(o); + statsObj.getStatsData().getDateStats().setLowValue(new Date(v.getDays())); + } else if (fName.equals("ndvbitvector")) { + PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi; + byte[] buf = ((BinaryObjectInspector) poi).getPrimitiveJavaObject(o); + statsObj.getStatsData().getDateStats().setBitVectors(buf); + } + } - StatsAggregator statsAggregator = null; - int ret = 0; - StatsCollectionContext scc = null; - 
EnvironmentContext environmentContext = null; - try { - // Stats setup: - final Warehouse wh = new Warehouse(conf); - if (!getWork().getNoStatsAggregator() && !getWork().isNoScanAnalyzeCommand()) { - try { - scc = getContext(); - statsAggregator = createStatsAggregator(scc, conf); - } catch (HiveException e) { - if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) { - throw e; - } - console.printError(ErrorMsg.STATS_SKIPPING_BY_ERROR.getErrorCodedMsg(e.toString())); - } + private void unpackPrimitiveObject(ObjectInspector oi, Object o, String fieldName, + ColumnStatisticsObj statsObj) throws UnsupportedDoubleException { + if (o == null) { + return; + } + // First infer the type of object + if (fieldName.equals("columntype")) { + PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi; + String s = ((StringObjectInspector) poi).getPrimitiveJavaObject(o); + ColumnStatisticsData statsData = new ColumnStatisticsData(); + + if (s.equalsIgnoreCase("long")) { + LongColumnStatsDataInspector longStats = new LongColumnStatsDataInspector(); + statsData.setLongStats(longStats); + statsObj.setStatsData(statsData); + } else if (s.equalsIgnoreCase("double")) { + DoubleColumnStatsDataInspector doubleStats = new DoubleColumnStatsDataInspector(); + statsData.setDoubleStats(doubleStats); + statsObj.setStatsData(statsData); + } else if (s.equalsIgnoreCase("string")) { + StringColumnStatsDataInspector stringStats = new StringColumnStatsDataInspector(); + statsData.setStringStats(stringStats); + statsObj.setStatsData(statsData); + } else if (s.equalsIgnoreCase("boolean")) { + BooleanColumnStatsData booleanStats = new BooleanColumnStatsData(); + statsData.setBooleanStats(booleanStats); + statsObj.setStatsData(statsData); + } else if (s.equalsIgnoreCase("binary")) { + BinaryColumnStatsData binaryStats = new BinaryColumnStatsData(); + statsData.setBinaryStats(binaryStats); + statsObj.setStatsData(statsData); + } else if (s.equalsIgnoreCase("decimal")) { + DecimalColumnStatsDataInspector decimalStats = new DecimalColumnStatsDataInspector(); + statsData.setDecimalStats(decimalStats); + statsObj.setStatsData(statsData); + } else if (s.equalsIgnoreCase("date")) { + DateColumnStatsDataInspector dateStats = new DateColumnStatsDataInspector(); + statsData.setDateStats(dateStats); + statsObj.setStatsData(statsData); + } + } else { + // invoke the right unpack method depending on data type of the column + if (statsObj.getStatsData().isSetBooleanStats()) { + unpackBooleanStats(oi, o, fieldName, statsObj); + } else if (statsObj.getStatsData().isSetLongStats()) { + unpackLongStats(oi, o, fieldName, statsObj); + } else if (statsObj.getStatsData().isSetDoubleStats()) { + unpackDoubleStats(oi, o, fieldName, statsObj); + } else if (statsObj.getStatsData().isSetStringStats()) { + unpackStringStats(oi, o, fieldName, statsObj); + } else if (statsObj.getStatsData().isSetBinaryStats()) { + unpackBinaryStats(oi, o, fieldName, statsObj); + } else if (statsObj.getStatsData().isSetDecimalStats()) { + unpackDecimalStats(oi, o, fieldName, statsObj); + } else if (statsObj.getStatsData().isSetDateStats()) { + unpackDateStats(oi, o, fieldName, statsObj); } + } + } - List partitions = getPartitionsList(db); - boolean atomic = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_ATOMIC); - - String tableFullName = table.getDbName() + "." 
+ table.getTableName(); - - if (partitions == null) { - org.apache.hadoop.hive.metastore.api.Table tTable = table.getTTable(); - Map parameters = tTable.getParameters(); - // In the following scenarios, we need to reset the stats to true. - // work.getTableSpecs() != null means analyze command - // work.getLoadTableDesc().getReplace() is true means insert overwrite command - // work.getLoadFileDesc().getDestinationCreateTable().isEmpty() means CTAS etc. - // acidTable will not have accurate stats unless it is set through analyze command. - if (work.getTableSpecs() == null && AcidUtils.isAcidTable(table)) { - StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.FALSE); - } else if (work.getTableSpecs() != null - || (work.getLoadTableDesc() != null && work.getLoadTableDesc().getReplace()) - || (work.getLoadFileDesc() != null && !work.getLoadFileDesc() - .getDestinationCreateTable().isEmpty())) { - StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.TRUE); - } - // non-partitioned tables: - if (!existStats(parameters) && atomic) { - return 0; - } + private void unpackStructObject(ObjectInspector oi, Object o, String fName, + ColumnStatisticsObj cStatsObj) throws UnsupportedDoubleException { + if (oi.getCategory() != ObjectInspector.Category.STRUCT) { + throw new RuntimeException("Invalid object datatype : " + oi.getCategory().toString()); + } - // The collectable stats for the aggregator needs to be cleared. - // For eg. if a file is being loaded, the old number of rows are not valid - if (work.isClearAggregatorStats()) { - // we choose to keep the invalid stats and only change the setting. - StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.FALSE); - } + StructObjectInspector soi = (StructObjectInspector) oi; + List fields = soi.getAllStructFieldRefs(); + List list = soi.getStructFieldsDataAsList(o); - updateQuickStats(wh, parameters, tTable.getSd()); - if (StatsSetupConst.areBasicStatsUptoDate(parameters)) { - if (statsAggregator != null) { - String prefix = getAggregationPrefix(table, null); - updateStats(statsAggregator, parameters, prefix, atomic); - } - // write table stats to metastore - if (!getWork().getNoStatsAggregator()) { - environmentContext = new EnvironmentContext(); - environmentContext.putToProperties(StatsSetupConst.STATS_GENERATED, - StatsSetupConst.TASK); - } - } + for (int i = 0; i < fields.size(); i++) { + // Get the field objectInspector, fieldName and the field object. + ObjectInspector foi = fields.get(i).getFieldObjectInspector(); + Object f = (list == null ? null : list.get(i)); + String fieldName = fields.get(i).getFieldName(); - getHive().alterTable(tableFullName, new Table(tTable), environmentContext); - if (conf.getBoolVar(ConfVars.TEZ_EXEC_SUMMARY)) { - console.printInfo("Table " + tableFullName + " stats: [" + toString(parameters) + ']'); - } - LOG.info("Table " + tableFullName + " stats: [" + toString(parameters) + ']'); + if (foi.getCategory() == ObjectInspector.Category.PRIMITIVE) { + unpackPrimitiveObject(foi, f, fieldName, cStatsObj); } else { - // Partitioned table: - // Need to get the old stats of the partition - // and update the table stats based on the old and new stats. - List updates = new ArrayList(); - - //Get the file status up-front for all partitions. Beneficial in cases of blob storage systems - final Map fileStatusMap = new ConcurrentHashMap(); - int poolSize = conf.getInt(ConfVars.HIVE_MOVE_FILES_THREAD_COUNT.varname, 1); - // In case thread count is set to 0, use single thread. 
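A note on the unpack methods carried over above: they all assume each packed row names the column type before any value fields, since the "columntype" field is what instantiates the thrift union arm that the remaining fields then mutate. A minimal, self-contained sketch of that two-phase dispatch (it uses the plain thrift types rather than the *Inspector subclasses the patch switches to, and the field names simply mirror the ones matched above):

import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
import org.apache.hadoop.hive.metastore.api.LongColumnStatsData;

public class UnpackSketch {
  // Phase one: "columntype" selects the union arm.
  // Phase two: later fields fill whichever arm is set.
  static void unpack(ColumnStatisticsObj obj, String fieldName, Object v) {
    if (fieldName.equals("columntype")) {
      ColumnStatisticsData data = new ColumnStatisticsData();
      if ("long".equalsIgnoreCase((String) v)) {
        data.setLongStats(new LongColumnStatsData());
      }
      obj.setStatsData(data);
    } else if (obj.getStatsData().isSetLongStats()) {
      if (fieldName.equals("countnulls")) {
        obj.getStatsData().getLongStats().setNumNulls((Long) v);
      } else if (fieldName.equals("max")) {
        obj.getStatsData().getLongStats().setHighValue((Long) v);
      }
    }
  }

  public static void main(String[] args) {
    ColumnStatisticsObj obj = new ColumnStatisticsObj();
    unpack(obj, "columntype", "long"); // the type field must arrive first
    unpack(obj, "countnulls", 7L);
    unpack(obj, "max", 42L);
    System.out.println(obj.getStatsData().getLongStats());
  }
}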
- poolSize = Math.max(poolSize, 1); - final ExecutorService pool = Executors.newFixedThreadPool(poolSize, - new ThreadFactoryBuilder().setDaemon(true) - .setNameFormat("stats-updater-thread-%d") - .build()); - final List> futures = Lists.newLinkedList(); - LOG.debug("Getting file stats of all partitions. threadpool size:" + poolSize); - try { - for(final Partition partn : partitions) { - final String partitionName = partn.getName(); - final org.apache.hadoop.hive.metastore.api.Partition tPart = partn.getTPartition(); - Map parameters = tPart.getParameters(); - - if (!existStats(parameters) && atomic) { - continue; - } - futures.add(pool.submit(new Callable() { - @Override - public Void call() throws Exception { - FileStatus[] partfileStatus = wh.getFileStatusesForSD(tPart.getSd()); - fileStatusMap.put(partitionName, partfileStatus); - return null; - } - })); - } - pool.shutdown(); - for(Future future : futures) { - future.get(); - } - } catch (InterruptedException e) { - LOG.debug("Cancelling " + futures.size() + " file stats lookup tasks"); - //cancel other futures - for (Future future : futures) { - future.cancel(true); - } - // Fail the query if the stats are supposed to be reliable - if (work.isStatsReliable()) { - ret = 1; - } - } finally { - if (pool != null) { - pool.shutdownNow(); - } - LOG.debug("Finished getting file stats of all partitions"); - } - - for (Partition partn : partitions) { - // - // get the old partition stats - // - org.apache.hadoop.hive.metastore.api.Partition tPart = partn.getTPartition(); - Map parameters = tPart.getParameters(); - if (work.getTableSpecs() == null && AcidUtils.isAcidTable(table)) { - StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.FALSE); - } else if (work.getTableSpecs() != null - || (work.getLoadTableDesc() != null && work.getLoadTableDesc().getReplace()) - || (work.getLoadFileDesc() != null && !work.getLoadFileDesc() - .getDestinationCreateTable().isEmpty())) { - StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.TRUE); - } - //only when the stats exist, it is added to fileStatusMap - if (!fileStatusMap.containsKey(partn.getName())) { - continue; - } - - // The collectable stats for the aggregator needs to be cleared. - // For eg. if a file is being loaded, the old number of rows are not valid - if (work.isClearAggregatorStats()) { - // we choose to keep the invalid stats and only change the setting. 
- StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.FALSE); - } - - updateQuickStats(parameters, fileStatusMap.get(partn.getName())); - if (StatsSetupConst.areBasicStatsUptoDate(parameters)) { - if (statsAggregator != null) { - String prefix = getAggregationPrefix(table, partn); - updateStats(statsAggregator, parameters, prefix, atomic); - } - if (!getWork().getNoStatsAggregator()) { - environmentContext = new EnvironmentContext(); - environmentContext.putToProperties(StatsSetupConst.STATS_GENERATED, - StatsSetupConst.TASK); - } - } - updates.add(new Partition(table, tPart)); - - if (conf.getBoolVar(ConfVars.TEZ_EXEC_SUMMARY)) { - console.printInfo("Partition " + tableFullName + partn.getSpec() + - " stats: [" + toString(parameters) + ']'); - } - LOG.info("Partition " + tableFullName + partn.getSpec() + - " stats: [" + toString(parameters) + ']'); - } - if (!updates.isEmpty()) { - db.alterPartitions(tableFullName, updates, environmentContext); - } + unpackStructObject(foi, f, fieldName, cStatsObj); } + } + } - } catch (Exception e) { - console.printInfo("[Warning] could not update stats.", - "Failed with exception " + e.getMessage() + "\n" - + StringUtils.stringifyException(e)); + private List constructColumnStatsFromPackedRows(Hive db) throws HiveException, + MetaException, IOException { + + String currentDb = SessionState.get().getCurrentDatabase(); + String tableName = work.getColStats().getTableName(); + String partName = null; + List colName = work.getColStats().getColName(); + List colType = work.getColStats().getColType(); + boolean isTblLevel = work.getColStats().isTblLevel(); + + List stats = new ArrayList(); + InspectableObject packedRow; + Table tbl = db.getTable(currentDb, tableName); + while ((packedRow = ftOp.getNextRow()) != null) { + if (packedRow.oi.getCategory() != ObjectInspector.Category.STRUCT) { + throw new HiveException("Unexpected object type encountered while unpacking row"); + } - // Fail the query if the stats are supposed to be reliable - if (work.isStatsReliable()) { - ret = 1; + List statsObjs = new ArrayList(); + StructObjectInspector soi = (StructObjectInspector) packedRow.oi; + List fields = soi.getAllStructFieldRefs(); + List list = soi.getStructFieldsDataAsList(packedRow.o); + + List partColSchema = tbl.getPartCols(); + // Partition columns are appended at end, we only care about stats column + int numOfStatCols = isTblLevel ? fields.size() : fields.size() - partColSchema.size(); + for (int i = 0; i < numOfStatCols; i++) { + // Get the field objectInspector, fieldName and the field object. + ObjectInspector foi = fields.get(i).getFieldObjectInspector(); + Object f = (list == null ? null : list.get(i)); + String fieldName = fields.get(i).getFieldName(); + ColumnStatisticsObj statsObj = new ColumnStatisticsObj(); + statsObj.setColName(colName.get(i)); + statsObj.setColType(colType.get(i)); + try { + unpackStructObject(foi, f, fieldName, statsObj); + statsObjs.add(statsObj); + } catch (UnsupportedDoubleException e) { + // due to infinity or nan. 
+ LOG.info("Because " + colName.get(i) + " is infinite or NaN, we skip stats."); + } } - } finally { - if (statsAggregator != null) { - statsAggregator.closeConnection(scc); + + if (!isTblLevel) { + List partVals = new ArrayList(); + // Iterate over partition columns to figure out partition name + for (int i = fields.size() - partColSchema.size(); i < fields.size(); i++) { + Object partVal = ((PrimitiveObjectInspector) fields.get(i).getFieldObjectInspector()) + .getPrimitiveJavaObject(list.get(i)); + partVals.add(partVal == null ? // could be null for default partition + this.conf.getVar(ConfVars.DEFAULTPARTITIONNAME) + : partVal.toString()); + } + partName = Warehouse.makePartName(partColSchema, partVals); + } + String[] names = Utilities.getDbTableName(currentDb, tableName); + ColumnStatisticsDesc statsDesc = getColumnStatsDesc(names[0], names[1], partName, isTblLevel); + ColumnStatistics colStats = new ColumnStatistics(); + colStats.setStatsDesc(statsDesc); + colStats.setStatsObj(statsObjs); + if (!colStats.getStatsObj().isEmpty()) { + stats.add(colStats); } } - // The return value of 0 indicates success, - // anything else indicates failure - return ret; + ftOp.clearFetchContext(); + return stats; } - private String getAggregationPrefix(Table table, Partition partition) - throws MetaException { - - // prefix is of the form dbName.tblName - String prefix = table.getDbName() + "." + MetaStoreUtils.encodeTableName(table.getTableName()); - if (partition != null) { - return Utilities.join(prefix, Warehouse.makePartPath(partition.getSpec())); + private ColumnStatisticsDesc getColumnStatsDesc(String dbName, String tableName, String partName, + boolean isTblLevel) { + ColumnStatisticsDesc statsDesc = new ColumnStatisticsDesc(); + statsDesc.setDbName(dbName); + statsDesc.setTableName(tableName); + statsDesc.setIsTblLevel(isTblLevel); + + if (!isTblLevel) { + statsDesc.setPartName(partName); + } else { + statsDesc.setPartName(null); } - return prefix; + return statsDesc; } - private StatsAggregator createStatsAggregator(StatsCollectionContext scc, HiveConf conf) throws HiveException { - String statsImpl = HiveConf.getVar(conf, HiveConf.ConfVars.HIVESTATSDBCLASS); - StatsFactory factory = StatsFactory.newFactory(statsImpl, conf); - if (factory == null) { - throw new HiveException(ErrorMsg.STATSPUBLISHER_NOT_OBTAINED.getErrorCodedMsg()); - } - // initialize stats publishing table for noscan which has only stats task - // the rest of MR task following stats task initializes it in ExecDriver.java - StatsPublisher statsPublisher = factory.getStatsPublisher(); - if (!statsPublisher.init(scc)) { // creating stats table if not exists - throw new HiveException(ErrorMsg.STATSPUBLISHER_INITIALIZATION_ERROR.getErrorCodedMsg()); - } - - // manufacture a StatsAggregator - StatsAggregator statsAggregator = factory.getStatsAggregator(); - if (!statsAggregator.connect(scc)) { - throw new HiveException(ErrorMsg.STATSAGGREGATOR_CONNECTION_ERROR.getErrorCodedMsg(statsImpl)); + private int persistColumnStats(Hive db) throws HiveException, MetaException, IOException { + // Construct a column statistics object from the result + List colStats = constructColumnStatsFromPackedRows(db); + // Persist the column statistics object to the metastore + // Note, this function is shared for both table and partition column stats. 
+ if (colStats.isEmpty()) { + return 0; } - return statsAggregator; + SetPartitionsStatsRequest request = new SetPartitionsStatsRequest(colStats); + request.setNeedMerge(work.getColStats().isNeedMerge()); + db.setPartitionColumnStatistics(request); + return 0; } - private StatsCollectionContext getContext() throws HiveException { - - StatsCollectionContext scc = new StatsCollectionContext(conf); - Task sourceTask = getWork().getSourceTask(); - if (sourceTask == null) { - throw new HiveException(ErrorMsg.STATSAGGREGATOR_SOURCETASK_NULL.getErrorCodedMsg()); + @Override + public int execute(DriverContext driverContext) { + if (driverContext.getCtx().getExplainAnalyze() == AnalyzeState.RUNNING) { + return 0; } - scc.setTask(sourceTask); - scc.setStatsTmpDir(this.getWork().getStatsTmpDir()); - return scc; - } - - private boolean existStats(Map parameters) { - return parameters.containsKey(StatsSetupConst.ROW_COUNT) - || parameters.containsKey(StatsSetupConst.NUM_FILES) - || parameters.containsKey(StatsSetupConst.TOTAL_SIZE) - || parameters.containsKey(StatsSetupConst.RAW_DATA_SIZE) - || parameters.containsKey(StatsSetupConst.NUM_PARTITIONS); - } - private void updateStats(StatsAggregator statsAggregator, - Map parameters, String prefix, boolean atomic) - throws HiveException { - - String aggKey = prefix.endsWith(Path.SEPARATOR) ? prefix : prefix + Path.SEPARATOR; - - for (String statType : StatsSetupConst.statsRequireCompute) { - String value = statsAggregator.aggregateStats(aggKey, statType); - if (value != null && !value.isEmpty()) { - long longValue = Long.parseLong(value); + // TODO: merge BasicStatsWork and BasicStatsNoJobWork + if (work.getBasicStatsWork() != null && work.getBasicStatsNoJobWork() != null) { + LOG.error("Can not have both basic stats work and stats no job work!"); + return 1; + } + int ret = 0; + if (work.getBasicStatsWork() != null) { + work.getBasicStatsWork().setFollowedByColStats(work.getfWork() != null); + Task basicStatsTask = TaskFactory.get(work.getBasicStatsWork(), conf); + basicStatsTask.initialize(queryState, queryPlan, driverContext, null); + ((BasicStatsTask) basicStatsTask).setDpPartSpecs(dpPartSpecs); + ret = ((BasicStatsTask) basicStatsTask).execute(driverContext); + } + if (work.getBasicStatsNoJobWork() != null) { + Task basicStatsTask = TaskFactory.get(work.getBasicStatsNoJobWork(), + conf); + basicStatsTask.initialize(queryState, queryPlan, driverContext, null); + ret = ((BasicStatsNoJobTask) basicStatsTask).execute(driverContext); + } + if (ret != 0) { + return ret; + } - if (work.getLoadTableDesc() != null && - !work.getLoadTableDesc().getReplace()) { - String originalValue = parameters.get(statType); - if (originalValue != null) { - longValue += Long.parseLong(originalValue); // todo: invalid + valid = invalid - } - } - parameters.put(statType, String.valueOf(longValue)); - } else { - if (atomic) { - throw new HiveException(ErrorMsg.STATSAGGREGATOR_MISSED_SOMESTATS, statType); - } + if (work.getfWork() != null) { + try { + Hive db = getHive(); + return persistColumnStats(db); + } catch (Exception e) { + LOG.error("Failed to run column stats task", e); + return 1; } } + return 0; } - private void updateQuickStats(Warehouse wh, Map parameters, - StorageDescriptor desc) throws MetaException { - /** - * calculate fast statistics - */ - FileStatus[] partfileStatus = wh.getFileStatusesForSD(desc); - updateQuickStats(parameters, partfileStatus); - } - - private void updateQuickStats(Map parameters, - FileStatus[] partfileStatus) throws MetaException { - 
MetaStoreUtils.populateQuickStats(partfileStatus, parameters); + @Override + public StageType getType() { + return StageType.COLUMNSTATS; } - private String toString(Map parameters) { - StringBuilder builder = new StringBuilder(); - for (String statType : StatsSetupConst.supportedStats) { - String value = parameters.get(statType); - if (value != null) { - if (builder.length() > 0) { - builder.append(", "); - } - builder.append(statType).append('=').append(value); - } - } - return builder.toString(); + @Override + public String getName() { + return "COLUMNSTATS TASK"; } - /** - * Get the list of partitions that need to update statistics. - * TODO: we should reuse the Partitions generated at compile time - * since getting the list of partitions is quite expensive. - * - * @return a list of partitions that need to update statistics. - * @throws HiveException - */ - private List getPartitionsList(Hive db) throws HiveException { - if (work.getLoadFileDesc() != null) { - return null; //we are in CTAS, so we know there are no partitions - } - - List list = new ArrayList(); - - if (work.getTableSpecs() != null) { - - // ANALYZE command - TableSpec tblSpec = work.getTableSpecs(); - table = tblSpec.tableHandle; - if (!table.isPartitioned()) { - return null; - } - // get all partitions that matches with the partition spec - List partitions = tblSpec.partitions; - if (partitions != null) { - for (Partition partn : partitions) { - list.add(partn); - } - } - } else if (work.getLoadTableDesc() != null) { + private Collection dpPartSpecs; - // INSERT OVERWRITE command - LoadTableDesc tbd = work.getLoadTableDesc(); - table = db.getTable(tbd.getTable().getTableName()); - if (!table.isPartitioned()) { - return null; - } - DynamicPartitionCtx dpCtx = tbd.getDPCtx(); - if (dpCtx != null && dpCtx.getNumDPCols() > 0) { // dynamic partitions - // If no dynamic partitions are generated, dpPartSpecs may not be initialized - if (dpPartSpecs != null) { - // load the list of DP partitions and return the list of partition specs - list.addAll(dpPartSpecs); - } - } else { // static partition - Partition partn = db.getPartition(table, tbd.getPartitionSpec(), false); - list.add(partn); - } + @Override + protected void receiveFeed(FeedType feedType, Object feedValue) { + // this method should be called by MoveTask when there are dynamic + // partitions generated + if (feedType == FeedType.DYNAMIC_PARTITIONS) { + dpPartSpecs = (Collection) feedValue; } - return list; } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/TaskFactory.java ql/src/java/org/apache/hadoop/hive/ql/exec/TaskFactory.java index fe9b6244df..abfdd867b8 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/TaskFactory.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/TaskFactory.java @@ -40,7 +40,7 @@ import org.apache.hadoop.hive.ql.io.rcfile.stats.PartialScanTask; import org.apache.hadoop.hive.ql.io.rcfile.stats.PartialScanWork; import org.apache.hadoop.hive.ql.plan.ColumnStatsUpdateWork; -import org.apache.hadoop.hive.ql.plan.ColumnStatsWork; +import org.apache.hadoop.hive.ql.plan.StatsWork; import org.apache.hadoop.hive.ql.plan.ConditionalWork; import org.apache.hadoop.hive.ql.plan.CopyWork; import org.apache.hadoop.hive.ql.plan.DDLWork; @@ -54,8 +54,8 @@ import org.apache.hadoop.hive.ql.plan.MoveWork; import org.apache.hadoop.hive.ql.plan.ReplCopyWork; import org.apache.hadoop.hive.ql.plan.SparkWork; -import org.apache.hadoop.hive.ql.plan.StatsNoJobWork; -import org.apache.hadoop.hive.ql.plan.StatsWork; +import 
org.apache.hadoop.hive.ql.plan.BasicStatsNoJobWork; +import org.apache.hadoop.hive.ql.plan.BasicStatsWork; import org.apache.hadoop.hive.ql.plan.TezWork; /** @@ -99,10 +99,9 @@ public TaskTuple(Class workClass, Class> taskClass) { taskvec.add(new TaskTuple(MapredLocalWork.class, MapredLocalTask.class)); - taskvec.add(new TaskTuple(StatsWork.class, - StatsTask.class)); - taskvec.add(new TaskTuple(StatsNoJobWork.class, StatsNoJobTask.class)); - taskvec.add(new TaskTuple(ColumnStatsWork.class, ColumnStatsTask.class)); + taskvec.add(new TaskTuple(BasicStatsWork.class, BasicStatsTask.class)); + taskvec.add(new TaskTuple(BasicStatsNoJobWork.class, BasicStatsNoJobTask.class)); + taskvec.add(new TaskTuple(StatsWork.class, StatsTask.class)); taskvec.add(new TaskTuple(ColumnStatsUpdateWork.class, ColumnStatsUpdateTask.class)); taskvec.add(new TaskTuple(MergeFileWork.class, MergeFileTask.class)); diff --git ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java index 9f98b69b18..a7befd0b2e 100644 --- ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java +++ ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java @@ -1741,7 +1741,9 @@ public Partition loadPartition(Path loadPath, Table tbl, } // column stats will be inaccurate - StatsSetupConst.clearColumnStatsState(newTPart.getParameters()); + if (!hasFollowingStatsTask) { + StatsSetupConst.clearColumnStatsState(newTPart.getParameters()); + } // recreate the partition if it existed before if (isSkewedStoreAsSubdir) { @@ -1760,8 +1762,8 @@ public Partition loadPartition(Path loadPath, Table tbl, if (oldPart == null) { newTPart.getTPartition().setParameters(new HashMap()); if (this.getConf().getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) { - StatsSetupConst.setStatsStateForCreateTable(newTPart.getParameters(), null, - StatsSetupConst.TRUE); + StatsSetupConst.setStatsStateForCreateTable(newTPart.getParameters(), + MetaStoreUtils.getColumnNames(tbl.getCols()), StatsSetupConst.TRUE); } MetaStoreUtils.populateQuickStats(HiveStatsUtils.getFileStatusRecurse(newPartPath, -1, newPartPath.getFileSystem(conf)), newTPart.getParameters()); try { @@ -2123,7 +2125,9 @@ public void loadTable(Path loadPath, String tableName, boolean replace, boolean } //column stats will be inaccurate - StatsSetupConst.clearColumnStatsState(tbl.getParameters()); + if (!hasFollowingStatsTask) { + StatsSetupConst.clearColumnStatsState(tbl.getParameters()); + } try { if (isSkewedStoreAsSubdir) { diff --git ql/src/java/org/apache/hadoop/hive/ql/metadata/SessionHiveMetaStoreClient.java ql/src/java/org/apache/hadoop/hive/ql/metadata/SessionHiveMetaStoreClient.java index 61f6a7c4ff..e49c5ae3d5 100644 --- ql/src/java/org/apache/hadoop/hive/ql/metadata/SessionHiveMetaStoreClient.java +++ ql/src/java/org/apache/hadoop/hive/ql/metadata/SessionHiveMetaStoreClient.java @@ -434,6 +434,8 @@ private void createTempTable(org.apache.hadoop.hive.metastore.api.Table tbl, // Add temp table info to current session Table tTable = new Table(tbl); + StatsSetupConst.setStatsStateForCreateTable(tbl.getParameters(), + MetaStoreUtils.getColumnNamesForTable(tbl), StatsSetupConst.TRUE); if (tables == null) { tables = new HashMap(); ss.getTempTables().put(dbName, tables); @@ -711,6 +713,13 @@ private boolean updateTempTableColumnStats(String dbName, String tableName, ssTableColStats); } mergeColumnStats(ssTableColStats, colStats); + + List colNames = new ArrayList<>(); + for (ColumnStatisticsObj obj : colStats.getStatsObj()) { + 
colNames.add(obj.getColName()); + } + org.apache.hadoop.hive.metastore.api.Table table = getTempTable(dbName, tableName); + StatsSetupConst.setColumnStatsState(table.getParameters(), colNames); return true; } diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRTableScan1.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRTableScan1.java index 9297a0b874..9f223af877 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRTableScan1.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRTableScan1.java @@ -28,6 +28,7 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.DriverContext; +import org.apache.hadoop.hive.ql.exec.StatsTask; import org.apache.hadoop.hive.ql.exec.Operator; import org.apache.hadoop.hive.ql.exec.TableScanOperator; import org.apache.hadoop.hive.ql.exec.Task; @@ -45,10 +46,11 @@ import org.apache.hadoop.hive.ql.parse.ParseContext; import org.apache.hadoop.hive.ql.parse.PrunedPartitionList; import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.plan.StatsWork; import org.apache.hadoop.hive.ql.plan.MapredWork; import org.apache.hadoop.hive.ql.plan.OperatorDesc; -import org.apache.hadoop.hive.ql.plan.StatsNoJobWork; -import org.apache.hadoop.hive.ql.plan.StatsWork; +import org.apache.hadoop.hive.ql.plan.BasicStatsNoJobWork; +import org.apache.hadoop.hive.ql.plan.BasicStatsWork; import org.apache.hadoop.mapred.InputFormat; /** @@ -99,7 +101,7 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx opProcCtx, // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS noscan; // There will not be any MR or Tez job above this task - StatsNoJobWork snjWork = new StatsNoJobWork(op.getConf().getTableMetadata().getTableSpec()); + BasicStatsNoJobWork snjWork = new BasicStatsNoJobWork(op.getConf().getTableMetadata().getTableSpec()); snjWork.setStatsReliable(parseCtx.getConf().getBoolVar( HiveConf.ConfVars.HIVE_STATS_RELIABLE)); // If partition is specified, get pruned partition list @@ -111,7 +113,7 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx opProcCtx, partCols, false); snjWork.setPrunedPartitionList(partList); } - Task snjTask = TaskFactory.get(snjWork, parseCtx.getConf()); + Task snjTask = TaskFactory.get(snjWork, parseCtx.getConf()); ctx.setCurrTask(snjTask); ctx.setCurrTopOp(null); ctx.getRootTasks().clear(); @@ -121,14 +123,15 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx opProcCtx, // The plan consists of a simple MapRedTask followed by a StatsTask. 
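Concretely, that follower is now the composite StatsTask: the scan job publishes the raw aggregates, and a single dependent task both finalizes basic stats and, when column autogather applies, persists column stats. The hunk below boils down to this wiring (it reuses the method's own locals, so treat it as a sketch rather than standalone code):

BasicStatsWork statsWork = new BasicStatsWork(op.getConf().getTableMetadata().getTableSpec());
statsWork.setAggKey(op.getConf().getStatsAggPrefix());
statsWork.setSourceTask(currTask);
StatsWork columnStatsWork = new StatsWork(statsWork); // wrap the basic-stats work
Task<? extends Serializable> columnStatsTask =
    TaskFactory.get(columnStatsWork, parseCtx.getConf());
currTask.addDependentTask(columnStatsTask);
// For "... COMPUTE STATISTICS noscan" the scan is dropped and the
// StatsTask is promoted to root (setParentTasks(null) below).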
// The MR task is just a simple TableScanOperator - StatsWork statsWork = new StatsWork(op.getConf().getTableMetadata().getTableSpec()); + BasicStatsWork statsWork = new BasicStatsWork(op.getConf().getTableMetadata().getTableSpec()); statsWork.setAggKey(op.getConf().getStatsAggPrefix()); statsWork.setStatsTmpDir(op.getConf().getTmpStatsDir()); statsWork.setSourceTask(currTask); statsWork.setStatsReliable(parseCtx.getConf().getBoolVar( HiveConf.ConfVars.HIVE_STATS_RELIABLE)); - Task statsTask = TaskFactory.get(statsWork, parseCtx.getConf()); - currTask.addDependentTask(statsTask); + StatsWork columnStatsWork = new StatsWork(statsWork); + Task columnStatsTask = TaskFactory.get(columnStatsWork, parseCtx.getConf()); + currTask.addDependentTask(columnStatsTask); if (!ctx.getRootTasks().contains(currTask)) { ctx.getRootTasks().add(currTask); } @@ -136,15 +139,15 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx opProcCtx, // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS noscan; // The plan consists of a StatsTask only. if (noScan) { - statsTask.setParentTasks(null); + columnStatsTask.setParentTasks(null); statsWork.setNoScanAnalyzeCommand(true); ctx.getRootTasks().remove(currTask); - ctx.getRootTasks().add(statsTask); + ctx.getRootTasks().add(columnStatsTask); } // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS partialscan; if (partialScan) { - handlePartialScanCommand(op, ctx, parseCtx, currTask, statsWork, statsTask); + handlePartialScanCommand(op, ctx, parseCtx, currTask, statsWork, columnStatsTask); } currWork.getMapWork().setGatheringStats(true); @@ -188,7 +191,7 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx opProcCtx, */ private void handlePartialScanCommand(TableScanOperator op, GenMRProcContext ctx, ParseContext parseCtx, Task currTask, - StatsWork statsWork, Task statsTask) throws SemanticException { + BasicStatsWork statsWork, Task statsTask) throws SemanticException { String aggregationKey = op.getConf().getStatsAggPrefix(); StringBuilder aggregationKeyBuffer = new StringBuilder(aggregationKey); List inputPaths = GenMapRedUtils.getInputPathsForPartialScan(op, aggregationKeyBuffer); diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java index da153e36d2..0286f1343a 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java @@ -35,6 +35,7 @@ import java.util.Set; import com.google.common.annotations.VisibleForTesting; + import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.common.BlobStorageUtils; @@ -88,6 +89,7 @@ import org.apache.hadoop.hive.ql.parse.PrunedPartitionList; import org.apache.hadoop.hive.ql.parse.SemanticException; import org.apache.hadoop.hive.ql.plan.BaseWork; +import org.apache.hadoop.hive.ql.plan.StatsWork; import org.apache.hadoop.hive.ql.plan.ConditionalResolverMergeFiles; import org.apache.hadoop.hive.ql.plan.ConditionalResolverMergeFiles.ConditionalResolverMergeFilesCtx; import org.apache.hadoop.hive.ql.plan.ConditionalWork; @@ -111,7 +113,7 @@ import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc; import org.apache.hadoop.hive.ql.plan.ReduceWork; import org.apache.hadoop.hive.ql.plan.SparkWork; -import org.apache.hadoop.hive.ql.plan.StatsWork; +import org.apache.hadoop.hive.ql.plan.BasicStatsWork; import org.apache.hadoop.hive.ql.plan.TableDesc; import 
org.apache.hadoop.hive.ql.plan.TableScanDesc; import org.apache.hadoop.hive.ql.plan.TezWork; @@ -498,6 +500,10 @@ public static void setMapWork(MapWork plan, ParseContext parseCtx, Set currTask, HiveConf hconf) { MoveWork mvWork = mvTask.getWork(); - StatsWork statsWork = null; + BasicStatsWork statsWork = null; if (mvWork.getLoadTableWork() != null) { - statsWork = new StatsWork(mvWork.getLoadTableWork()); + statsWork = new BasicStatsWork(mvWork.getLoadTableWork()); } else if (mvWork.getLoadFileWork() != null) { - statsWork = new StatsWork(mvWork.getLoadFileWork()); + statsWork = new BasicStatsWork(mvWork.getLoadFileWork()); } assert statsWork != null : "Error when generating StatsTask"; @@ -1506,7 +1512,8 @@ public static void addStatsTask(FileSinkOperator nd, MoveTask mvTask, // AggKey in StatsWork is used for stats aggregation while StatsAggPrefix // in FileSinkDesc is used for stats publishing. They should be consistent. statsWork.setAggKey(nd.getConf().getStatsAggPrefix()); - Task statsTask = TaskFactory.get(statsWork, hconf); + StatsWork columnStatsWork = new StatsWork(statsWork); + Task statsTask = TaskFactory.get(columnStatsWork, hconf); // subscribe feeds from the MoveTask so that MoveTask can forward the list // of dynamic partition list to the StatsTask diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/MemoryDecider.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/MemoryDecider.java index 3a20cfe7ac..eae9505709 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/MemoryDecider.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/MemoryDecider.java @@ -35,10 +35,11 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.exec.StatsTask; import org.apache.hadoop.hive.ql.exec.MapJoinOperator; import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; import org.apache.hadoop.hive.ql.exec.Operator; -import org.apache.hadoop.hive.ql.exec.StatsTask; +import org.apache.hadoop.hive.ql.exec.BasicStatsTask; import org.apache.hadoop.hive.ql.exec.Task; import org.apache.hadoop.hive.ql.exec.tez.DagUtils; import org.apache.hadoop.hive.ql.exec.tez.TezTask; @@ -92,8 +93,9 @@ public MemoryCalculator(PhysicalContext pctx) { public Object dispatch(Node nd, Stack stack, Object... 
nodeOutputs) throws SemanticException { Task currTask = (Task) nd; - if (currTask instanceof StatsTask) { - currTask = ((StatsTask) currTask).getWork().getSourceTask(); + if (currTask instanceof StatsTask + && ((StatsTask) currTask).getWork().getBasicStatsWork() != null) { + currTask = ((StatsTask) currTask).getWork().getBasicStatsWork().getSourceTask(); } if (currTask instanceof TezTask) { TezWork work = ((TezTask) currTask).getWork(); diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/SerializeFilter.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/SerializeFilter.java index dc433fed22..f3c6daedfc 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/SerializeFilter.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/SerializeFilter.java @@ -26,8 +26,9 @@ import java.util.Set; import java.util.Stack; -import org.apache.hadoop.hive.ql.exec.SerializationUtilities; import org.apache.hadoop.hive.ql.exec.StatsTask; +import org.apache.hadoop.hive.ql.exec.SerializationUtilities; +import org.apache.hadoop.hive.ql.exec.BasicStatsTask; import org.apache.hadoop.hive.ql.exec.TableScanOperator; import org.apache.hadoop.hive.ql.exec.Task; import org.apache.hadoop.hive.ql.exec.tez.TezTask; @@ -71,8 +72,9 @@ public Serializer(PhysicalContext pctx) { public Object dispatch(Node nd, Stack stack, Object... nodeOutputs) throws SemanticException { Task currTask = (Task) nd; - if (currTask instanceof StatsTask) { - currTask = ((StatsTask) currTask).getWork().getSourceTask(); + if (currTask instanceof StatsTask + && ((StatsTask) currTask).getWork().getBasicStatsWork() != null) { + currTask = ((StatsTask) currTask).getWork().getBasicStatsWork().getSourceTask(); } if (currTask instanceof TezTask) { TezWork work = ((TezTask) currTask).getWork(); diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/DDLSemanticAnalyzer.java ql/src/java/org/apache/hadoop/hive/ql/parse/DDLSemanticAnalyzer.java index 251decac9b..9e2015221e 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/DDLSemanticAnalyzer.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/DDLSemanticAnalyzer.java @@ -91,6 +91,7 @@ import org.apache.hadoop.hive.ql.plan.CacheMetadataDesc; import org.apache.hadoop.hive.ql.plan.ColumnStatsDesc; import org.apache.hadoop.hive.ql.plan.ColumnStatsUpdateWork; +import org.apache.hadoop.hive.ql.plan.StatsWork; import org.apache.hadoop.hive.ql.plan.CreateDatabaseDesc; import org.apache.hadoop.hive.ql.plan.CreateIndexDesc; import org.apache.hadoop.hive.ql.plan.DDLWork; @@ -130,7 +131,7 @@ import org.apache.hadoop.hive.ql.plan.ShowTablesDesc; import org.apache.hadoop.hive.ql.plan.ShowTblPropertiesDesc; import org.apache.hadoop.hive.ql.plan.ShowTxnsDesc; -import org.apache.hadoop.hive.ql.plan.StatsWork; +import org.apache.hadoop.hive.ql.plan.BasicStatsWork; import org.apache.hadoop.hive.ql.plan.SwitchDatabaseDesc; import org.apache.hadoop.hive.ql.plan.TableDesc; import org.apache.hadoop.hive.ql.plan.TruncateTableDesc; @@ -1091,18 +1092,19 @@ private void analyzeTruncateTable(ASTNode ast) throws SemanticException { // Recalculate the HDFS stats if auto gather stats is set if (conf.getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) { - StatsWork statDesc; + BasicStatsWork basicStatsWork; if (oldTblPartLoc.equals(newTblPartLoc)) { // If we're merging to the same location, we can avoid some metastore calls TableSpec tablepart = new TableSpec(this.db, conf, root); - statDesc = new StatsWork(tablepart); + basicStatsWork = new BasicStatsWork(tablepart); } else { - 
statDesc = new StatsWork(ltd); + basicStatsWork = new BasicStatsWork(ltd); } - statDesc.setNoStatsAggregator(true); - statDesc.setClearAggregatorStats(true); - statDesc.setStatsReliable(conf.getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE)); - Task statTask = TaskFactory.get(statDesc, conf); + basicStatsWork.setNoStatsAggregator(true); + basicStatsWork.setClearAggregatorStats(true); + basicStatsWork.setStatsReliable(conf.getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE)); + StatsWork columnStatsWork = new StatsWork(basicStatsWork); + Task statTask = TaskFactory.get(columnStatsWork, conf); moveTsk.addDependentTask(statTask); } } catch (HiveException e) { @@ -1724,18 +1726,19 @@ private void analyzeAlterTablePartMergeFiles(ASTNode ast, mergeTask.addDependentTask(moveTsk); if (conf.getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) { - StatsWork statDesc; + BasicStatsWork basicStatsWork; if (oldTblPartLoc.equals(newTblPartLoc)) { // If we're merging to the same location, we can avoid some metastore calls TableSpec tableSpec = new TableSpec(db, tableName, partSpec); - statDesc = new StatsWork(tableSpec); + basicStatsWork = new BasicStatsWork(tableSpec); } else { - statDesc = new StatsWork(ltd); + basicStatsWork = new BasicStatsWork(ltd); } - statDesc.setNoStatsAggregator(true); - statDesc.setClearAggregatorStats(true); - statDesc.setStatsReliable(conf.getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE)); - Task statTask = TaskFactory.get(statDesc, conf); + basicStatsWork.setNoStatsAggregator(true); + basicStatsWork.setClearAggregatorStats(true); + basicStatsWork.setStatsReliable(conf.getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE)); + StatsWork columnStatsWork = new StatsWork(basicStatsWork); + Task statTask = TaskFactory.get(columnStatsWork, conf); moveTsk.addDependentTask(statTask); } diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/ExplainSemanticAnalyzer.java ql/src/java/org/apache/hadoop/hive/ql/parse/ExplainSemanticAnalyzer.java index 7a0d4a752e..fbef0dd837 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/ExplainSemanticAnalyzer.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/ExplainSemanticAnalyzer.java @@ -40,7 +40,7 @@ import org.apache.hadoop.hive.ql.QueryState; import org.apache.hadoop.hive.ql.exec.ExplainTask; import org.apache.hadoop.hive.ql.exec.FetchTask; -import org.apache.hadoop.hive.ql.exec.StatsTask; +import org.apache.hadoop.hive.ql.exec.BasicStatsTask; import org.apache.hadoop.hive.ql.exec.Task; import org.apache.hadoop.hive.ql.exec.TaskFactory; import org.apache.hadoop.hive.ql.metadata.HiveException; diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java index 40d4fad314..39c027a8cf 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java @@ -205,10 +205,6 @@ protected void setupMapWork(MapWork mapWork, GenTezProcContext context, // All the setup is done in GenMapRedUtils GenMapRedUtils.setMapWork(mapWork, context.parseContext, context.inputs, partitions, root, alias, context.conf, false); - // we also collect table stats while collecting column stats. 
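The truncate and concatenate paths above, and the LOAD path below, now all converge on the same construction: build the no-scan BasicStatsWork the old code built, then wrap it before handing it to the task factory. Roughly, assuming ltd is the call site's LoadTableDesc and moveTsk its MoveTask:

BasicStatsWork basicStatsWork = new BasicStatsWork(ltd);
basicStatsWork.setNoStatsAggregator(true);
basicStatsWork.setClearAggregatorStats(true);
basicStatsWork.setStatsReliable(conf.getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
StatsWork columnStatsWork = new StatsWork(basicStatsWork);
Task<? extends Serializable> statTask = TaskFactory.get(columnStatsWork, conf);
moveTsk.addDependentTask(statTask);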
- if (context.parseContext.getAnalyzeRewrite() != null) { - mapWork.setGatheringStats(true); - } } // removes any union operator and clones the plan diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java index fa79700df7..6a7b3db17a 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java @@ -19,7 +19,6 @@ package org.apache.hadoop.hive.ql.parse; import org.apache.hadoop.hive.conf.HiveConf.StrictChecks; - import org.apache.hadoop.hive.conf.HiveConf.ConfVars; import java.io.IOException; @@ -51,9 +50,10 @@ import org.apache.hadoop.hive.ql.metadata.Hive; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.metadata.Partition; +import org.apache.hadoop.hive.ql.plan.StatsWork; import org.apache.hadoop.hive.ql.plan.LoadTableDesc; import org.apache.hadoop.hive.ql.plan.MoveWork; -import org.apache.hadoop.hive.ql.plan.StatsWork; +import org.apache.hadoop.hive.ql.plan.BasicStatsWork; import org.apache.hadoop.mapred.InputFormat; import com.google.common.collect.Lists; @@ -301,11 +301,12 @@ public void analyzeInternal(ASTNode ast) throws SemanticException { // Update the stats which do not require a complete scan. Task statTask = null; if (conf.getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) { - StatsWork statDesc = new StatsWork(loadTableWork); - statDesc.setNoStatsAggregator(true); - statDesc.setClearAggregatorStats(true); - statDesc.setStatsReliable(conf.getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE)); - statTask = TaskFactory.get(statDesc, conf); + BasicStatsWork basicStatsWork = new BasicStatsWork(loadTableWork); + basicStatsWork.setNoStatsAggregator(true); + basicStatsWork.setClearAggregatorStats(true); + basicStatsWork.setStatsReliable(conf.getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE)); + StatsWork columnStatsWork = new StatsWork(basicStatsWork); + statTask = TaskFactory.get(columnStatsWork, conf); } // HIVE-3334 has been filed for load file with index auto update diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/ProcessAnalyzeTable.java ql/src/java/org/apache/hadoop/hive/ql/parse/ProcessAnalyzeTable.java index b6d7ee8a92..abc7ed29a4 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/ProcessAnalyzeTable.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/ProcessAnalyzeTable.java @@ -44,9 +44,10 @@ import org.apache.hadoop.hive.ql.metadata.Table; import org.apache.hadoop.hive.ql.optimizer.GenMapRedUtils; import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.TableSpec; -import org.apache.hadoop.hive.ql.plan.MapWork; -import org.apache.hadoop.hive.ql.plan.StatsNoJobWork; import org.apache.hadoop.hive.ql.plan.StatsWork; +import org.apache.hadoop.hive.ql.plan.MapWork; +import org.apache.hadoop.hive.ql.plan.BasicStatsNoJobWork; +import org.apache.hadoop.hive.ql.plan.BasicStatsWork; import org.apache.hadoop.hive.ql.plan.TezWork; import org.apache.hadoop.mapred.InputFormat; @@ -103,7 +104,7 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procContext, // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS noscan; // There will not be any Tez job above this task - StatsNoJobWork snjWork = new StatsNoJobWork(tableScan.getConf().getTableMetadata() + BasicStatsNoJobWork snjWork = new BasicStatsNoJobWork(tableScan.getConf().getTableMetadata() .getTableSpec()); snjWork.setStatsReliable(parseContext.getConf().getBoolVar( 
HiveConf.ConfVars.HIVE_STATS_RELIABLE)); @@ -116,7 +117,7 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procContext, false); snjWork.setPrunedPartitionList(partList); } - Task snjTask = TaskFactory.get(snjWork, parseContext.getConf()); + Task snjTask = TaskFactory.get(snjWork, parseContext.getConf()); snjTask.setParentTasks(null); context.rootTasks.remove(context.currentTask); context.rootTasks.add(snjTask); @@ -127,27 +128,28 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procContext, // The plan consists of a simple TezTask followed by a StatsTask. // The Tez task is just a simple TableScanOperator - StatsWork statsWork = new StatsWork(tableScan.getConf().getTableMetadata().getTableSpec()); - statsWork.setAggKey(tableScan.getConf().getStatsAggPrefix()); - statsWork.setStatsTmpDir(tableScan.getConf().getTmpStatsDir()); - statsWork.setSourceTask(context.currentTask); - statsWork.setStatsReliable(parseContext.getConf().getBoolVar( + BasicStatsWork basicStatsWork = new BasicStatsWork(tableScan.getConf().getTableMetadata().getTableSpec()); + basicStatsWork.setAggKey(tableScan.getConf().getStatsAggPrefix()); + basicStatsWork.setStatsTmpDir(tableScan.getConf().getTmpStatsDir()); + basicStatsWork.setSourceTask(context.currentTask); + basicStatsWork.setStatsReliable(parseContext.getConf().getBoolVar( HiveConf.ConfVars.HIVE_STATS_RELIABLE)); - Task statsTask = TaskFactory.get(statsWork, parseContext.getConf()); + StatsWork columnStatsWork = new StatsWork(basicStatsWork); + Task statsTask = TaskFactory.get(columnStatsWork, parseContext.getConf()); context.currentTask.addDependentTask(statsTask); // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS noscan; // The plan consists of a StatsTask only. if (parseContext.getQueryProperties().isNoScanAnalyzeCommand()) { statsTask.setParentTasks(null); - statsWork.setNoScanAnalyzeCommand(true); + columnStatsWork.getBasicStatsWork().setNoScanAnalyzeCommand(true); context.rootTasks.remove(context.currentTask); context.rootTasks.add(statsTask); } // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS partialscan; if (parseContext.getQueryProperties().isPartialScanAnalyzeCommand()) { - handlePartialScanCommand(tableScan, parseContext, statsWork, context, statsTask); + handlePartialScanCommand(tableScan, parseContext, basicStatsWork, context, statsTask); } // NOTE: here we should use the new partition predicate pushdown API to @@ -166,65 +168,18 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procContext, return true; } - } else if (parseContext.getAnalyzeRewrite() != null) { - // we need to collect table stats while collecting column stats. 
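// Sketch of where the rewrite-based column-stats path moves: for
// ANALYZE ... COMPUTE STATISTICS FOR COLUMNS, TaskCompiler now attaches both halves to a
// single StatsTask instead of creating a separate ColumnStatsTask (fetch, basicStatsWork
// and rootTask are illustrative names, not part of the patch):
//   StatsWork work = new StatsWork(basicStatsWork);
//   work.setfWork(fetch); // rows produced by the rewritten column-stats query
//   work.setColStats(new ColumnStatsDesc(tableName, colName, colType, isTblLevel, numBitVector));
//   Task statsTask = TaskFactory.get(work, parseContext.getConf());
//   rootTask.addDependentTask(statsTask);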
- try { - context.currentTask.addDependentTask(genTableStats(context, tableScan)); - } catch (HiveException e) { - throw new SemanticException(e); - } } return null; } - private Task genTableStats(GenTezProcContext context, TableScanOperator tableScan) - throws HiveException { - Class inputFormat = tableScan.getConf().getTableMetadata() - .getInputFormatClass(); - ParseContext parseContext = context.parseContext; - Table table = tableScan.getConf().getTableMetadata(); - List partitions = new ArrayList<>(); - if (table.isPartitioned()) { - partitions.addAll(parseContext.getPrunedPartitions(tableScan).getPartitions()); - for (Partition partn : partitions) { - LOG.debug("XXX: adding part: " + partn); - context.outputs.add(new WriteEntity(partn, WriteEntity.WriteType.DDL_NO_LOCK)); - } - } - TableSpec tableSpec = new TableSpec(table, partitions); - tableScan.getConf().getTableMetadata().setTableSpec(tableSpec); - - if (inputFormat.equals(OrcInputFormat.class)) { - // For ORC, there is no Tez Job for table stats. - StatsNoJobWork snjWork = new StatsNoJobWork(tableScan.getConf().getTableMetadata() - .getTableSpec()); - snjWork.setStatsReliable(parseContext.getConf().getBoolVar( - HiveConf.ConfVars.HIVE_STATS_RELIABLE)); - // If partition is specified, get pruned partition list - if (partitions.size() > 0) { - snjWork.setPrunedPartitionList(parseContext.getPrunedPartitions(tableScan)); - } - return TaskFactory.get(snjWork, parseContext.getConf()); - } else { - - StatsWork statsWork = new StatsWork(tableScan.getConf().getTableMetadata().getTableSpec()); - statsWork.setAggKey(tableScan.getConf().getStatsAggPrefix()); - statsWork.setStatsTmpDir(tableScan.getConf().getTmpStatsDir()); - statsWork.setSourceTask(context.currentTask); - statsWork.setStatsReliable(parseContext.getConf().getBoolVar( - HiveConf.ConfVars.HIVE_STATS_RELIABLE)); - return TaskFactory.get(statsWork, parseContext.getConf()); - } - } - /** * handle partial scan command. * * It is composed of PartialScanTask followed by StatsTask. 
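* The follow-up StatsTask is expected to locate the PartialScanTask's output through the
* aggregation key prefix taken from the TableScanOperator.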
*/ private void handlePartialScanCommand(TableScanOperator tableScan, ParseContext parseContext, - StatsWork statsWork, GenTezProcContext context, Task statsTask) + BasicStatsWork statsWork, GenTezProcContext context, Task statsTask) throws SemanticException { String aggregationKey = tableScan.getConf().getStatsAggPrefix(); diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java index 1c74779dec..de07ef24d5 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java @@ -7330,7 +7330,8 @@ protected Operator genFileSinkPlan(String dest, QB qb, Operator input) // the following code is used to collect column stats when // hive.stats.autogather=true // and it is an insert overwrite or insert into table - if (dest_tab != null && conf.getBoolVar(ConfVars.HIVESTATSAUTOGATHER) + if (dest_tab != null && !dest_tab.isNonNative() + && conf.getBoolVar(ConfVars.HIVESTATSAUTOGATHER) && conf.getBoolVar(ConfVars.HIVESTATSCOLAUTOGATHER) && ColumnStatsAutoGatherContext.canRunAutogatherStats(fso)) { if (dest_type.intValue() == QBMetaData.DEST_TABLE) { @@ -10474,10 +10475,7 @@ private void setupStats(TableScanDesc tsDesc, QBParseInfo qbp, Table tab, String throws SemanticException { // if it is not analyze command and not column stats, then do not gatherstats - // if it is column stats, but it is not tez, do not gatherstats - if ((!qbp.isAnalyzeCommand() && qbp.getAnalyzeRewrite() == null) - || (qbp.getAnalyzeRewrite() != null && !HiveConf.getVar(conf, - HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("tez"))) { + if (!qbp.isAnalyzeCommand() && qbp.getAnalyzeRewrite() == null) { tsDesc.setGatherStats(false); } else { if (HiveConf.getVar(conf, HIVESTATSDBCLASS).equalsIgnoreCase(StatDB.fs.name())) { diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/TaskCompiler.java ql/src/java/org/apache/hadoop/hive/ql/parse/TaskCompiler.java index 08a8f00e06..9ef2fe367d 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/TaskCompiler.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/TaskCompiler.java @@ -21,11 +21,14 @@ import java.io.Serializable; import java.util.ArrayList; import java.util.Collection; +import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; +import java.util.LinkedHashMap; import java.util.LinkedHashSet; import java.util.LinkedList; import java.util.List; +import java.util.Map; import java.util.Queue; import java.util.Set; import java.util.Stack; @@ -35,15 +38,17 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.common.HiveStatsUtils; import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.MetaStoreUtils; import org.apache.hadoop.hive.metastore.Warehouse; import org.apache.hadoop.hive.metastore.api.MetaException; import org.apache.hadoop.hive.ql.Context; import org.apache.hadoop.hive.ql.ErrorMsg; import org.apache.hadoop.hive.ql.QueryState; -import org.apache.hadoop.hive.ql.exec.ColumnStatsTask; +import org.apache.hadoop.hive.ql.exec.StatsTask; import org.apache.hadoop.hive.ql.exec.FetchTask; import org.apache.hadoop.hive.ql.exec.Operator; -import org.apache.hadoop.hive.ql.exec.StatsTask; +import org.apache.hadoop.hive.ql.exec.BasicStatsTask; +import org.apache.hadoop.hive.ql.exec.TableScanOperator; import org.apache.hadoop.hive.ql.exec.Task; import org.apache.hadoop.hive.ql.exec.TaskFactory; import org.apache.hadoop.hive.ql.exec.Utilities; @@ -51,13 +56,19 
@@ import org.apache.hadoop.hive.ql.exec.spark.SparkTask; import org.apache.hadoop.hive.ql.hooks.ReadEntity; import org.apache.hadoop.hive.ql.hooks.WriteEntity; +import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat; import org.apache.hadoop.hive.ql.metadata.Hive; import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.metadata.Partition; +import org.apache.hadoop.hive.ql.metadata.Table; import org.apache.hadoop.hive.ql.optimizer.GenMapRedUtils; import org.apache.hadoop.hive.ql.optimizer.physical.AnnotateRunTimeStatsOptimizer; import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.AnalyzeRewriteContext; +import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.TableSpec; +import org.apache.hadoop.hive.ql.plan.BasicStatsNoJobWork; +import org.apache.hadoop.hive.ql.plan.BasicStatsWork; import org.apache.hadoop.hive.ql.plan.ColumnStatsDesc; -import org.apache.hadoop.hive.ql.plan.ColumnStatsWork; +import org.apache.hadoop.hive.ql.plan.StatsWork; import org.apache.hadoop.hive.ql.plan.CreateTableDesc; import org.apache.hadoop.hive.ql.plan.CreateViewDesc; import org.apache.hadoop.hive.ql.plan.DDLWork; @@ -76,6 +87,7 @@ import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe; import org.apache.hadoop.hive.serde2.thrift.ThriftFormatter; import org.apache.hadoop.hive.serde2.thrift.ThriftJDBCBinarySerDe; +import org.apache.hadoop.mapred.InputFormat; import com.google.common.collect.Interner; import com.google.common.collect.Interners; @@ -294,18 +306,53 @@ public void compile(final ParseContext pCtx, final List> leafTasks = new LinkedHashSet>(); - getLeafTasks(rootTasks, leafTasks); + // map from tablename to task (ColumnStatsTask which includes a BasicStatsTask) + Map map = new LinkedHashMap<>(); if (isCStats) { - genColumnStatsTask(pCtx.getAnalyzeRewrite(), loadFileWork, leafTasks, outerQueryLimit, 0); + if (rootTasks == null || rootTasks.size() != 1 || pCtx.getTopOps() == null + || pCtx.getTopOps().size() != 1) { + throw new SemanticException("Can not find correct root task!"); + } + try { + Task root = rootTasks.iterator().next(); + StatsTask tsk = (StatsTask) genTableStats(pCtx, pCtx.getTopOps().values() + .iterator().next(), root, outputs); + root.addDependentTask(tsk); + map.put(extractTableFullName((StatsTask) tsk), (StatsTask) tsk); + } catch (HiveException e) { + throw new SemanticException(e); + } + genColumnStatsTask(pCtx.getAnalyzeRewrite(), loadFileWork, map, outerQueryLimit, 0); } else { + Set> leafTasks = new LinkedHashSet>(); + getLeafTasks(rootTasks, leafTasks); + List> nonStatsLeafTasks = new ArrayList<>(); + for (Task tsk : leafTasks) { + // map table name to the correct ColumnStatsTask + if (tsk instanceof StatsTask) { + map.put(extractTableFullName((StatsTask) tsk), (StatsTask) tsk); + } else { + nonStatsLeafTasks.add(tsk); + } + } + // add cStatsTask as a dependent of all the nonStatsLeafTasks + for (Task tsk : nonStatsLeafTasks) { + for (Task cStatsTask : map.values()) { + tsk.addDependentTask(cStatsTask); + } + } for (ColumnStatsAutoGatherContext columnStatsAutoGatherContext : pCtx .getColumnStatsAutoGatherContexts()) { if (!columnStatsAutoGatherContext.isInsertInto()) { genColumnStatsTask(columnStatsAutoGatherContext.getAnalyzeRewrite(), - columnStatsAutoGatherContext.getLoadFileWork(), leafTasks, outerQueryLimit, 0); + columnStatsAutoGatherContext.getLoadFileWork(), map, outerQueryLimit, 0); } else { int numBitVector; try { @@ -314,7 +361,7 @@ public void compile(final ParseContext pCtx, final List genTableStats(ParseContext 
parseContext, TableScanOperator tableScan, Task currentTask, final HashSet outputs) + throws HiveException { + Class inputFormat = tableScan.getConf().getTableMetadata() + .getInputFormatClass(); + Table table = tableScan.getConf().getTableMetadata(); + List partitions = new ArrayList<>(); + if (table.isPartitioned()) { + partitions.addAll(parseContext.getPrunedPartitions(tableScan).getPartitions()); + for (Partition partn : partitions) { + LOG.debug("XXX: adding part: " + partn); + outputs.add(new WriteEntity(partn, WriteEntity.WriteType.DDL_NO_LOCK)); + } + } + TableSpec tableSpec = new TableSpec(table, partitions); + tableScan.getConf().getTableMetadata().setTableSpec(tableSpec); + + if (inputFormat.equals(OrcInputFormat.class)) { + // For ORC, there is no Tez Job for table stats. + BasicStatsNoJobWork snjWork = new BasicStatsNoJobWork(tableScan.getConf().getTableMetadata() + .getTableSpec()); + snjWork.setStatsReliable(parseContext.getConf().getBoolVar( + HiveConf.ConfVars.HIVE_STATS_RELIABLE)); + // If partition is specified, get pruned partition list + if (partitions.size() > 0) { + snjWork.setPrunedPartitionList(parseContext.getPrunedPartitions(tableScan)); + } + StatsWork columnStatsWork = new StatsWork(snjWork); + return TaskFactory.get(columnStatsWork, parseContext.getConf()); + } else { + BasicStatsWork statsWork = new BasicStatsWork(tableScan.getConf().getTableMetadata().getTableSpec()); + statsWork.setAggKey(tableScan.getConf().getStatsAggPrefix()); + statsWork.setStatsTmpDir(tableScan.getConf().getTmpStatsDir()); + statsWork.setSourceTask(currentTask); + statsWork.setStatsReliable(parseContext.getConf().getBoolVar( + HiveConf.ConfVars.HIVE_STATS_RELIABLE)); + StatsWork columnStatsWork = new StatsWork(statsWork); + return TaskFactory.get(columnStatsWork, parseContext.getConf()); + } + } + private void patchUpAfterCTASorMaterializedView(final List> rootTasks, final HashSet outputs, Task createTask) { @@ -388,7 +496,8 @@ private void patchUpAfterCTASorMaterializedView(final List 0); for (Task task : leaves) { - if (task instanceof StatsTask) { + if (task instanceof StatsTask + && ((StatsTask) task).getWork().getBasicStatsWork() != null) { // StatsTask require table to already exist for (Task parentOfStatsTask : task.getParentTasks()) { parentOfStatsTask.addDependentTask(createTask); @@ -416,13 +525,12 @@ private void patchUpAfterCTASorMaterializedView(final List loadFileWork, Set> leafTasks, - int outerQueryLimit, int numBitVector) { - ColumnStatsTask cStatsTask = null; - ColumnStatsWork cStatsWork = null; + List loadFileWork, Map map, + int outerQueryLimit, int numBitVector) throws SemanticException { FetchWork fetch = null; String tableName = analyzeRewrite.getTableName(); List colName = analyzeRewrite.getColName(); @@ -450,10 +558,12 @@ protected void genColumnStatsTask(AnalyzeRewriteContext analyzeRewrite, ColumnStatsDesc cStatsDesc = new ColumnStatsDesc(tableName, colName, colType, isTblLevel, numBitVector); - cStatsWork = new ColumnStatsWork(fetch, cStatsDesc); - cStatsTask = (ColumnStatsTask) TaskFactory.get(cStatsWork, conf); - for (Task tsk : leafTasks) { - tsk.addDependentTask(cStatsTask); + StatsTask columnStatsTask = map.get(tableName); + if (columnStatsTask == null) { + throw new SemanticException("Can not find " + tableName + " in genColumnStatsTask"); + } else { + columnStatsTask.getWork().setfWork(fetch); + columnStatsTask.getWork().setColStats(cStatsDesc); } } diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/spark/SparkProcessAnalyzeTable.java 
ql/src/java/org/apache/hadoop/hive/ql/parse/spark/SparkProcessAnalyzeTable.java index a2876e1d4f..74162c20f5 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/spark/SparkProcessAnalyzeTable.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/spark/SparkProcessAnalyzeTable.java @@ -42,10 +42,11 @@ import org.apache.hadoop.hive.ql.parse.ParseContext; import org.apache.hadoop.hive.ql.parse.PrunedPartitionList; import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.plan.StatsWork; import org.apache.hadoop.hive.ql.plan.MapWork; import org.apache.hadoop.hive.ql.plan.SparkWork; -import org.apache.hadoop.hive.ql.plan.StatsNoJobWork; -import org.apache.hadoop.hive.ql.plan.StatsWork; +import org.apache.hadoop.hive.ql.plan.BasicStatsNoJobWork; +import org.apache.hadoop.hive.ql.plan.BasicStatsWork; import org.apache.hadoop.mapred.InputFormat; import com.google.common.base.Preconditions; @@ -106,7 +107,7 @@ public Object process(Node nd, Stack stack, // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS partialscan; // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS noscan; // There will not be any Spark job above this task - StatsNoJobWork snjWork = new StatsNoJobWork(tableScan.getConf().getTableMetadata().getTableSpec()); + BasicStatsNoJobWork snjWork = new BasicStatsNoJobWork(tableScan.getConf().getTableMetadata().getTableSpec()); snjWork.setStatsReliable(parseContext.getConf().getBoolVar( HiveConf.ConfVars.HIVE_STATS_RELIABLE)); // If partition is specified, get pruned partition list @@ -118,7 +119,7 @@ public Object process(Node nd, Stack stack, false); snjWork.setPrunedPartitionList(partList); } - Task snjTask = TaskFactory.get(snjWork, parseContext.getConf()); + Task snjTask = TaskFactory.get(snjWork, parseContext.getConf()); snjTask.setParentTasks(null); context.rootTasks.remove(context.currentTask); context.rootTasks.add(snjTask); @@ -129,26 +130,27 @@ public Object process(Node nd, Stack stack, // The plan consists of a simple SparkTask followed by a StatsTask. // The Spark task is just a simple TableScanOperator - StatsWork statsWork = new StatsWork(tableScan.getConf().getTableMetadata().getTableSpec()); - statsWork.setAggKey(tableScan.getConf().getStatsAggPrefix()); - statsWork.setStatsTmpDir(tableScan.getConf().getTmpStatsDir()); - statsWork.setSourceTask(context.currentTask); - statsWork.setStatsReliable(parseContext.getConf().getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE)); - Task statsTask = TaskFactory.get(statsWork, parseContext.getConf()); + BasicStatsWork basicStatsWork = new BasicStatsWork(tableScan.getConf().getTableMetadata().getTableSpec()); + basicStatsWork.setAggKey(tableScan.getConf().getStatsAggPrefix()); + basicStatsWork.setStatsTmpDir(tableScan.getConf().getTmpStatsDir()); + basicStatsWork.setSourceTask(context.currentTask); + basicStatsWork.setStatsReliable(parseContext.getConf().getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE)); + StatsWork columnStatsWork = new StatsWork(basicStatsWork); + Task statsTask = TaskFactory.get(columnStatsWork, parseContext.getConf()); context.currentTask.addDependentTask(statsTask); // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS noscan; // The plan consists of a StatsTask only. 
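// For comparison, a condensed sketch of the work-class choice made in TaskCompiler.genTableStats
// above (tableSpec and statsReliable are illustrative locals): ORC needs no execution job for
// basic stats, so the no-job variant is chosen there.
//   StatsWork work;
//   if (inputFormat.equals(OrcInputFormat.class)) {
//     BasicStatsNoJobWork snjWork = new BasicStatsNoJobWork(tableSpec);
//     snjWork.setStatsReliable(statsReliable);
//     work = new StatsWork(snjWork);
//   } else {
//     BasicStatsWork bsWork = new BasicStatsWork(tableSpec);
//     bsWork.setSourceTask(context.currentTask); // aggregates what the scan job publishes
//     work = new StatsWork(bsWork);
//   }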
if (parseContext.getQueryProperties().isNoScanAnalyzeCommand()) { statsTask.setParentTasks(null); - statsWork.setNoScanAnalyzeCommand(true); + basicStatsWork.setNoScanAnalyzeCommand(true); context.rootTasks.remove(context.currentTask); context.rootTasks.add(statsTask); } // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS partialscan; if (parseContext.getQueryProperties().isPartialScanAnalyzeCommand()) { - handlePartialScanCommand(tableScan, parseContext, statsWork, context, statsTask); + handlePartialScanCommand(tableScan, parseContext, basicStatsWork, context, statsTask); } // NOTE: here we should use the new partition predicate pushdown API to get a list of pruned list, @@ -176,7 +178,7 @@ public Object process(Node nd, Stack stack, * It is composed of PartialScanTask followed by StatsTask. */ private void handlePartialScanCommand(TableScanOperator tableScan, ParseContext parseContext, - StatsWork statsWork, GenSparkProcContext context, Task statsTask) + BasicStatsWork statsWork, GenSparkProcContext context, Task statsTask) throws SemanticException { String aggregationKey = tableScan.getConf().getStatsAggPrefix(); StringBuilder aggregationKeyBuffer = new StringBuilder(aggregationKey); diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/StatsNoJobWork.java ql/src/java/org/apache/hadoop/hive/ql/plan/BasicStatsNoJobWork.java similarity index 83% rename from ql/src/java/org/apache/hadoop/hive/ql/plan/StatsNoJobWork.java rename to ql/src/java/org/apache/hadoop/hive/ql/plan/BasicStatsNoJobWork.java index 77c04f6c6e..db33ac400d 100644 --- ql/src/java/org/apache/hadoop/hive/ql/plan/StatsNoJobWork.java +++ ql/src/java/org/apache/hadoop/hive/ql/plan/BasicStatsNoJobWork.java @@ -22,29 +22,27 @@ import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.TableSpec; import org.apache.hadoop.hive.ql.parse.PrunedPartitionList; -import org.apache.hadoop.hive.ql.plan.Explain.Level; /** * Client-side stats aggregator task. */ -@Explain(displayName = "Stats-Aggr Operator", explainLevels = { Level.USER, Level.DEFAULT, Level.EXTENDED }) -public class StatsNoJobWork implements Serializable { +public class BasicStatsNoJobWork implements Serializable { private static final long serialVersionUID = 1L; private TableSpec tableSpecs; private boolean statsReliable; private PrunedPartitionList prunedPartitionList; - public StatsNoJobWork() { + public BasicStatsNoJobWork() { } - public StatsNoJobWork(TableSpec tableSpecs) { + public BasicStatsNoJobWork(TableSpec tableSpecs) { this.tableSpecs = tableSpecs; } - public StatsNoJobWork(boolean statsReliable) { + public BasicStatsNoJobWork(boolean statsReliable) { this.statsReliable = statsReliable; } @@ -67,4 +65,5 @@ public void setPrunedPartitionList(PrunedPartitionList prunedPartitionList) { public PrunedPartitionList getPrunedPartitionList() { return prunedPartitionList; } + } diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/BasicStatsWork.java ql/src/java/org/apache/hadoop/hive/ql/plan/BasicStatsWork.java new file mode 100644 index 0000000000..24dbff119a --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/plan/BasicStatsWork.java @@ -0,0 +1,180 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.plan; + +import java.io.Serializable; + +import org.apache.hadoop.hive.ql.exec.Task; +import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.TableSpec; +import org.apache.hadoop.hive.ql.plan.Explain.Level; + + +/** + * BasicStatsWork: the basic (table/partition-level) statistics part of a StatsWork. + * + */ +public class BasicStatsWork implements Serializable { + private static final long serialVersionUID = 1L; + + private TableSpec tableSpecs; // source table spec -- for TableScanOperator + private LoadTableDesc loadTableDesc; // same as MoveWork.loadTableDesc -- for FileSinkOperator + private LoadFileDesc loadFileDesc; // same as MoveWork.loadFileDesc -- for FileSinkOperator + private String aggKey; // aggregation key prefix + private boolean statsReliable; // are stats completely reliable + + // If stats aggregator is not present, clear the current aggregator stats. + // For example, if a merge is being performed, stats already collected by aggregator (numrows etc.) + // are still valid. However, if a load file is being performed, the old stats collected by + // aggregator are not valid. It might be a good idea to clear them instead of leaving wrong + // and old stats. + // Since HIVE-12661, we maintain the old stats (although they may be wrong) for CBO + // purposes. We use a flag COLUMN_STATS_ACCURATE to + // show the accuracy of the stats. + + private boolean clearAggregatorStats = false; + + private boolean noStatsAggregator = false; + + private boolean isNoScanAnalyzeCommand = false; + + private boolean isPartialScanAnalyzeCommand = false; + + // sourceTask for TS is not changed (currently) but that of FS might be changed + // by various optimizers (auto.convert.join, for example) + // so this is set by DriverContext at runtime + private transient Task sourceTask; + + private boolean isFollowedByColStats = false; + + // used by FS based stats collector + private String statsTmpDir; + + public BasicStatsWork() { + } + + public BasicStatsWork(TableSpec tableSpecs) { + this.tableSpecs = tableSpecs; + } + + public BasicStatsWork(LoadTableDesc loadTableDesc) { + this.loadTableDesc = loadTableDesc; + } + + public BasicStatsWork(LoadFileDesc loadFileDesc) { + this.loadFileDesc = loadFileDesc; + } + + public TableSpec getTableSpecs() { + return tableSpecs; + } + + public LoadTableDesc getLoadTableDesc() { + return loadTableDesc; + } + + public LoadFileDesc getLoadFileDesc() { + return loadFileDesc; + } + + public void setAggKey(String aggK) { + aggKey = aggK; + } + + @Explain(displayName = "Stats Aggregation Key Prefix", explainLevels = { Level.EXTENDED }) + public String getAggKey() { + return aggKey; + } + + public String getStatsTmpDir() { + return statsTmpDir; + } + + public void setStatsTmpDir(String statsTmpDir) { + this.statsTmpDir = statsTmpDir; + } + + public boolean getNoStatsAggregator() { + return noStatsAggregator; + } + + public void setNoStatsAggregator(boolean noStatsAggregator) { + this.noStatsAggregator = noStatsAggregator; + } + + public boolean isStatsReliable() { + return statsReliable; + } + + public void setStatsReliable(boolean statsReliable) { + this.statsReliable =
statsReliable; + } + + public boolean isClearAggregatorStats() { + return clearAggregatorStats; + } + + public void setClearAggregatorStats(boolean clearAggregatorStats) { + this.clearAggregatorStats = clearAggregatorStats; + } + + /** + * @return the isNoScanAnalyzeCommand + */ + public boolean isNoScanAnalyzeCommand() { + return isNoScanAnalyzeCommand; + } + + /** + * @param isNoScanAnalyzeCommand the isNoScanAnalyzeCommand to set + */ + public void setNoScanAnalyzeCommand(boolean isNoScanAnalyzeCommand) { + this.isNoScanAnalyzeCommand = isNoScanAnalyzeCommand; + } + + /** + * @return the isPartialScanAnalyzeCommand + */ + public boolean isPartialScanAnalyzeCommand() { + return isPartialScanAnalyzeCommand; + } + + /** + * @param isPartialScanAnalyzeCommand the isPartialScanAnalyzeCommand to set + */ + public void setPartialScanAnalyzeCommand(boolean isPartialScanAnalyzeCommand) { + this.isPartialScanAnalyzeCommand = isPartialScanAnalyzeCommand; + } + + public Task getSourceTask() { + return sourceTask; + } + + public void setSourceTask(Task sourceTask) { + this.sourceTask = sourceTask; + } + + public boolean isFollowedByColStats() { + return isFollowedByColStats; + } + + public void setFollowedByColStats(boolean isFollowedByColStats) { + this.isFollowedByColStats = isFollowedByColStats; + } + +} diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/ColumnStatsDesc.java ql/src/java/org/apache/hadoop/hive/ql/plan/ColumnStatsDesc.java index 97f323f4b7..a756a29d8b 100644 --- ql/src/java/org/apache/hadoop/hive/ql/plan/ColumnStatsDesc.java +++ ql/src/java/org/apache/hadoop/hive/ql/plan/ColumnStatsDesc.java @@ -30,6 +30,7 @@ private static final long serialVersionUID = 1L; private boolean isTblLevel; private int numBitVector; + private boolean needMerge; private String tableName; private List colName; private List colType; @@ -44,6 +45,7 @@ public ColumnStatsDesc(String tableName, List colName, List colT this.colType = colType; this.isTblLevel = isTblLevel; this.numBitVector = 0; + this.needMerge = false; } public ColumnStatsDesc(String tableName, List colName, @@ -53,6 +55,7 @@ public ColumnStatsDesc(String tableName, List colName, this.colType = colType; this.isTblLevel = isTblLevel; this.numBitVector = numBitVector; + this.needMerge = this.numBitVector != 0; } @Explain(displayName = "Table") @@ -99,4 +102,8 @@ public void setNumBitVector(int numBitVector) { this.numBitVector = numBitVector; } + public boolean isNeedMerge() { + return needMerge; + } + } diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/ColumnStatsWork.java ql/src/java/org/apache/hadoop/hive/ql/plan/ColumnStatsWork.java deleted file mode 100644 index 76811b1a93..0000000000 --- ql/src/java/org/apache/hadoop/hive/ql/plan/ColumnStatsWork.java +++ /dev/null @@ -1,88 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.plan; - -import java.io.Serializable; - -import org.apache.hadoop.hive.ql.CompilationOpContext; -import org.apache.hadoop.hive.ql.exec.ListSinkOperator; -import org.apache.hadoop.hive.ql.plan.Explain.Level; - - -/** - * ColumnStats Work. - * - */ -@Explain(displayName = "Column Stats Work", explainLevels = { Level.USER, Level.DEFAULT, Level.EXTENDED }) -public class ColumnStatsWork implements Serializable { - private static final long serialVersionUID = 1L; - private FetchWork fWork; - private ColumnStatsDesc colStats; - private static final int LIMIT = -1; - - - public ColumnStatsWork() { - } - - public ColumnStatsWork(FetchWork work, ColumnStatsDesc colStats) { - this.fWork = work; - this.setColStats(colStats); - } - - @Override - public String toString() { - String ret; - ret = fWork.toString(); - return ret; - } - - public FetchWork getfWork() { - return fWork; - } - - public void setfWork(FetchWork fWork) { - this.fWork = fWork; - } - - @Explain(displayName = "Column Stats Desc") - public ColumnStatsDesc getColStats() { - return colStats; - } - - public void setColStats(ColumnStatsDesc colStats) { - this.colStats = colStats; - } - - public ListSinkOperator getSink() { - return fWork.getSink(); - } - - public void initializeForFetch(CompilationOpContext ctx) { - fWork.initializeForFetch(ctx); - } - - public int getLeastNumRows() { - return fWork.getLeastNumRows(); - } - - public static int getLimit() { - return LIMIT; - } - -} diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/StatsWork.java ql/src/java/org/apache/hadoop/hive/ql/plan/StatsWork.java index a5050c5368..e6120077a7 100644 --- ql/src/java/org/apache/hadoop/hive/ql/plan/StatsWork.java +++ ql/src/java/org/apache/hadoop/hive/ql/plan/StatsWork.java @@ -20,151 +20,99 @@ import java.io.Serializable; -import org.apache.hadoop.hive.ql.exec.Task; -import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.TableSpec; +import org.apache.hadoop.hive.ql.CompilationOpContext; +import org.apache.hadoop.hive.ql.exec.ListSinkOperator; import org.apache.hadoop.hive.ql.plan.Explain.Level; - /** - * ConditionalStats. + * Stats Work, may include basic stats work and column stats desc * */ -@Explain(displayName = "Stats-Aggr Operator", explainLevels = { Level.USER, Level.DEFAULT, Level.EXTENDED }) +@Explain(displayName = "Stats Work", explainLevels = { Level.USER, Level.DEFAULT, + Level.EXTENDED }) public class StatsWork implements Serializable { private static final long serialVersionUID = 1L; - - private TableSpec tableSpecs; // source table spec -- for TableScanOperator - private LoadTableDesc loadTableDesc; // same as MoveWork.loadTableDesc -- for FileSinkOperator - private LoadFileDesc loadFileDesc; // same as MoveWork.loadFileDesc -- for FileSinkOperator - private String aggKey; // aggregation key prefix - private boolean statsReliable; // are stats completely reliable - - // If stats aggregator is not present, clear the current aggregator stats. - // For eg. if a merge is being performed, stats already collected by aggregator (numrows etc.) - // are still valid. However, if a load file is being performed, the old stats collected by - // aggregator are not valid. It might be a good idea to clear them instead of leaving wrong - // and old stats. - // Since HIVE-12661, we maintain the old stats (although may be wrong) for CBO - // purpose. 
We use a flag COLUMN_STATS_ACCURATE to - // show the accuracy of the stats. - - private boolean clearAggregatorStats = false; - - private boolean noStatsAggregator = false; - - private boolean isNoScanAnalyzeCommand = false; - - private boolean isPartialScanAnalyzeCommand = false; - - // sourceTask for TS is not changed (currently) but that of FS might be changed - // by various optimizers (auto.convert.join, for example) - // so this is set by DriverContext in runtime - private transient Task sourceTask; - - // used by FS based stats collector - private String statsTmpDir; + // this is for basic stats + private BasicStatsWork basicStatsWork; + private BasicStatsNoJobWork basicStatsNoJobWork; + private FetchWork fWork; + private ColumnStatsDesc colStats; + private static final int LIMIT = -1; public StatsWork() { } - public StatsWork(TableSpec tableSpecs) { - this.tableSpecs = tableSpecs; - } - - public StatsWork(LoadTableDesc loadTableDesc) { - this.loadTableDesc = loadTableDesc; - } - - public StatsWork(LoadFileDesc loadFileDesc) { - this.loadFileDesc = loadFileDesc; - } - - public TableSpec getTableSpecs() { - return tableSpecs; + public StatsWork(BasicStatsWork basicStatsWork) { + super(); + this.basicStatsWork = basicStatsWork; } - public LoadTableDesc getLoadTableDesc() { - return loadTableDesc; + public StatsWork(BasicStatsNoJobWork basicStatsNoJobWork) { + super(); + this.basicStatsNoJobWork = basicStatsNoJobWork; } - public LoadFileDesc getLoadFileDesc() { - return loadFileDesc; + public StatsWork(FetchWork work, ColumnStatsDesc colStats) { + this.fWork = work; + this.setColStats(colStats); } - public void setAggKey(String aggK) { - aggKey = aggK; + @Override + public String toString() { + String ret; + ret = fWork.toString(); + return ret; } - @Explain(displayName = "Stats Aggregation Key Prefix", explainLevels = { Level.EXTENDED }) - public String getAggKey() { - return aggKey; + public FetchWork getfWork() { + return fWork; } - public String getStatsTmpDir() { - return statsTmpDir; + public void setfWork(FetchWork fWork) { + this.fWork = fWork; } - public void setStatsTmpDir(String statsTmpDir) { - this.statsTmpDir = statsTmpDir; + @Explain(displayName = "Column Stats Desc") + public ColumnStatsDesc getColStats() { + return colStats; } - public boolean getNoStatsAggregator() { - return noStatsAggregator; + public void setColStats(ColumnStatsDesc colStats) { + this.colStats = colStats; } - public void setNoStatsAggregator(boolean noStatsAggregator) { - this.noStatsAggregator = noStatsAggregator; + public ListSinkOperator getSink() { + return fWork.getSink(); } - public boolean isStatsReliable() { - return statsReliable; + public void initializeForFetch(CompilationOpContext ctx) { + fWork.initializeForFetch(ctx); } - public void setStatsReliable(boolean statsReliable) { - this.statsReliable = statsReliable; + public int getLeastNumRows() { + return fWork.getLeastNumRows(); } - public boolean isClearAggregatorStats() { - return clearAggregatorStats; + public static int getLimit() { + return LIMIT; } - public void setClearAggregatorStats(boolean clearAggregatorStats) { - this.clearAggregatorStats = clearAggregatorStats; + @Explain(displayName = "Basic Stats Work", explainLevels = { Level.USER, Level.DEFAULT, Level.EXTENDED }) + public BasicStatsWork getBasicStatsWork() { + return basicStatsWork; } - /** - * @return the isNoScanAnalyzeCommand - */ - public boolean isNoScanAnalyzeCommand() { - return isNoScanAnalyzeCommand; + public void setBasicStatsWork(BasicStatsWork 
basicStatsWork) { + this.basicStatsWork = basicStatsWork; } - /** - * @param isNoScanAnalyzeCommand the isNoScanAnalyzeCommand to set - */ - public void setNoScanAnalyzeCommand(boolean isNoScanAnalyzeCommand) { - this.isNoScanAnalyzeCommand = isNoScanAnalyzeCommand; + @Explain(displayName = "Basic Stats NoJob Work", explainLevels = { Level.USER, Level.DEFAULT, Level.EXTENDED }) + public BasicStatsNoJobWork getBasicStatsNoJobWork() { + return basicStatsNoJobWork; } - /** - * @return the isPartialScanAnalyzeCommand - */ - public boolean isPartialScanAnalyzeCommand() { - return isPartialScanAnalyzeCommand; + public void setBasicStatsNoJobWork(BasicStatsNoJobWork basicStatsNoJobWork) { + this.basicStatsNoJobWork = basicStatsNoJobWork; } - /** - * @param isPartialScanAnalyzeCommand the isPartialScanAnalyzeCommand to set - */ - public void setPartialScanAnalyzeCommand(boolean isPartialScanAnalyzeCommand) { - this.isPartialScanAnalyzeCommand = isPartialScanAnalyzeCommand; - } - - public Task getSourceTask() { - return sourceTask; - } - - public void setSourceTask(Task sourceTask) { - this.sourceTask = sourceTask; - } } diff --git ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommands.java ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommands.java index 4f1c7d8b1e..44ce74d8bb 100644 --- ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommands.java +++ ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommands.java @@ -853,4 +853,4 @@ public void testMoreBucketsThanReducers2() throws Exception { int[][] expected = {{0, -1},{0, -1}, {1, -1}, {1, -1}, {2, -1}, {2, -1}, {3, -1}, {3, -1}}; Assert.assertEquals(stringifyValues(expected), r); } -} \ No newline at end of file +} diff --git ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommands2.java ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommands2.java index 21b4a2ce55..ad8d39244b 100644 --- ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommands2.java +++ ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommands2.java @@ -129,6 +129,7 @@ protected void setUpWithTableProperties(String tableProperties) throws Exception .setVar(HiveConf.ConfVars.HIVE_AUTHORIZATION_MANAGER, "org.apache.hadoop.hive.ql.security.authorization.plugin.sqlstd.SQLStdHiveAuthorizerFactory"); hiveConf.setBoolVar(HiveConf.ConfVars.MERGE_CARDINALITY_VIOLATION_CHECK, true); + hiveConf.setBoolVar(HiveConf.ConfVars.HIVESTATSCOLAUTOGATHER, false); TxnDbUtil.setConfValues(hiveConf); TxnDbUtil.prepDb(); diff --git ql/src/test/org/apache/hadoop/hive/ql/TxnCommandsBaseForTests.java ql/src/test/org/apache/hadoop/hive/ql/TxnCommandsBaseForTests.java index ad2aac5f56..9062d924c4 100644 --- ql/src/test/org/apache/hadoop/hive/ql/TxnCommandsBaseForTests.java +++ ql/src/test/org/apache/hadoop/hive/ql/TxnCommandsBaseForTests.java @@ -64,6 +64,7 @@ void setUpInternal() throws Exception { .setVar(HiveConf.ConfVars.HIVE_AUTHORIZATION_MANAGER, "org.apache.hadoop.hive.ql.security.authorization.plugin.sqlstd.SQLStdHiveAuthorizerFactory"); hiveConf.setBoolVar(HiveConf.ConfVars.MERGE_CARDINALITY_VIOLATION_CHECK, true); + hiveConf.setBoolVar(HiveConf.ConfVars.HIVESTATSCOLAUTOGATHER, false); TxnDbUtil.setConfValues(hiveConf); TxnDbUtil.prepDb(); File f = new File(getWarehouseDir()); diff --git ql/src/test/queries/clientpositive/autoColumnStats_1.q ql/src/test/queries/clientpositive/autoColumnStats_1.q index 7955b07233..cc32393e65 100644 --- ql/src/test/queries/clientpositive/autoColumnStats_1.q +++ ql/src/test/queries/clientpositive/autoColumnStats_1.q @@ -60,6 +60,8 @@ drop table nzhang_part14; create table if not exists 
nzhang_part14 (key string) partitioned by (value string); +desc formatted nzhang_part14; + insert overwrite table nzhang_part14 partition(value) select key, value from ( select * from (select 'k1' as key, cast(null as string) as value from src limit 2)a @@ -69,6 +71,8 @@ select key, value from ( select * from (select 'k3' as key, ' ' as value from src limit 2)c ) T; +desc formatted nzhang_part14 partition (value=' '); + explain select key from nzhang_part14; diff --git ql/src/test/queries/clientpositive/autoColumnStats_10.q ql/src/test/queries/clientpositive/autoColumnStats_10.q new file mode 100644 index 0000000000..bf166d8701 --- /dev/null +++ ql/src/test/queries/clientpositive/autoColumnStats_10.q @@ -0,0 +1,52 @@ +set hive.mapred.mode=nonstrict; +set hive.stats.column.autogather=true; + +drop table p; + +CREATE TABLE p(insert_num int, c1 tinyint, c2 smallint); + +desc formatted p; + +insert into p values (1,22,333); + +desc formatted p; + +alter table p replace columns (insert_num int, c1 STRING, c2 STRING); + +desc formatted p; + +desc formatted p insert_num; +desc formatted p c1; + +insert into p values (2,11,111); + +desc formatted p; + +desc formatted p insert_num; +desc formatted p c1; + +set hive.stats.column.autogather=false; + +drop table p; + +CREATE TABLE p(insert_num int, c1 tinyint, c2 smallint); + +desc formatted p; + +insert into p values (1,22,333); + +desc formatted p; + +alter table p replace columns (insert_num int, c1 STRING, c2 STRING); + +desc formatted p; + +desc formatted p insert_num; +desc formatted p c1; + +insert into p values (2,11,111); + +desc formatted p; + +desc formatted p insert_num; +desc formatted p c1; diff --git ql/src/test/queries/clientpositive/bucket_map_join_tez2.q ql/src/test/queries/clientpositive/bucket_map_join_tez2.q index 37989ecc9d..c600f639da 100644 --- ql/src/test/queries/clientpositive/bucket_map_join_tez2.q +++ ql/src/test/queries/clientpositive/bucket_map_join_tez2.q @@ -1,3 +1,4 @@ +set hive.stats.column.autogather=false; set hive.strict.checks.bucketing=false; set hive.mapred.mode=nonstrict; diff --git ql/src/test/queries/clientpositive/bucket_num_reducers.q ql/src/test/queries/clientpositive/bucket_num_reducers.q index 06f334e833..5c5008eea7 100644 --- ql/src/test/queries/clientpositive/bucket_num_reducers.q +++ ql/src/test/queries/clientpositive/bucket_num_reducers.q @@ -1,4 +1,4 @@ -; +set hive.stats.column.autogather=false; set hive.exec.mode.local.auto=false; set mapred.reduce.tasks = 10; diff --git ql/src/test/queries/clientpositive/combine1.q ql/src/test/queries/clientpositive/combine1.q index 3bcb8b19c1..b300830884 100644 --- ql/src/test/queries/clientpositive/combine1.q +++ ql/src/test/queries/clientpositive/combine1.q @@ -7,6 +7,8 @@ set mapred.max.split.size=256; set mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec; +set hive.stats.column.autogather=false; + -- SORT_QUERY_RESULTS create table combine1_1(key string, value string) stored as textfile; diff --git ql/src/test/queries/clientpositive/correlationoptimizer5.q ql/src/test/queries/clientpositive/correlationoptimizer5.q index 45b8cb955d..002fb12e22 100644 --- ql/src/test/queries/clientpositive/correlationoptimizer5.q +++ ql/src/test/queries/clientpositive/correlationoptimizer5.q @@ -1,3 +1,5 @@ +set hive.stats.column.autogather=false; +-- Currently, a query with multiple FileSinkOperators is not supported.
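-- Why the tests below pin hive.stats.column.autogather=false: with the flag on (as
-- exercised by autoColumnStats_10.q above), every INSERT also computes column
-- statistics, which adds tasks to the plan and changes golden outputs. A minimal
-- illustration, with a hypothetical table t:
--   set hive.stats.column.autogather=true;
--   create table t (i int);
--   insert into t values (1);  -- gathers column stats for i as a side effect
--   desc formatted t i;        -- shows column stats; COLUMN_STATS_ACCURATE now
--                              -- lists "i" in its COLUMN_STATS map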
set hive.mapred.mode=nonstrict; CREATE TABLE T1(key INT, val STRING); LOAD DATA LOCAL INPATH '../../data/files/kv1.txt' INTO TABLE T1; diff --git ql/src/test/queries/clientpositive/encryption_insert_values.q ql/src/test/queries/clientpositive/encryption_insert_values.q index 2dd3e9ad1d..c8d1d519f3 100644 --- ql/src/test/queries/clientpositive/encryption_insert_values.q +++ ql/src/test/queries/clientpositive/encryption_insert_values.q @@ -1,4 +1,5 @@ -- SORT_QUERY_RESULTS; +set hive.stats.column.autogather=false; DROP TABLE IF EXISTS encrypted_table PURGE; CREATE TABLE encrypted_table (key INT, value STRING) LOCATION '${hiveconf:hive.metastore.warehouse.dir}/default/encrypted_table'; @@ -12,4 +13,4 @@ select * from encrypted_table; -- this checks that we've actually created temp table data under encrypted_table folder describe formatted values__tmp__table__1; -CRYPTO DELETE_KEY --keyName key_128; \ No newline at end of file +CRYPTO DELETE_KEY --keyName key_128; diff --git ql/src/test/queries/clientpositive/encryption_join_with_different_encryption_keys.q ql/src/test/queries/clientpositive/encryption_join_with_different_encryption_keys.q index 4dcea1f7ce..7159ad5995 100644 --- ql/src/test/queries/clientpositive/encryption_join_with_different_encryption_keys.q +++ ql/src/test/queries/clientpositive/encryption_join_with_different_encryption_keys.q @@ -1,3 +1,4 @@ +set hive.stats.column.autogather=false; --SORT_QUERY_RESULTS -- Java JCE must be installed in order to hava a key length of 256 bits diff --git ql/src/test/queries/clientpositive/encryption_move_tbl.q ql/src/test/queries/clientpositive/encryption_move_tbl.q index 0b7771cc4a..8d865aa6e8 100644 --- ql/src/test/queries/clientpositive/encryption_move_tbl.q +++ ql/src/test/queries/clientpositive/encryption_move_tbl.q @@ -1,4 +1,5 @@ -- SORT_QUERY_RESULTS; +set hive.stats.column.autogather=false; -- we're setting this so that TestNegaiveCliDriver.vm doesn't stop processing after ALTER TABLE fails; diff --git ql/src/test/queries/clientpositive/exec_parallel_column_stats.q ql/src/test/queries/clientpositive/exec_parallel_column_stats.q index ceacc2442d..a89b707918 100644 --- ql/src/test/queries/clientpositive/exec_parallel_column_stats.q +++ ql/src/test/queries/clientpositive/exec_parallel_column_stats.q @@ -1,5 +1,7 @@ set hive.exec.parallel=true; -explain analyze table src compute statistics for columns; +create table t as select * from src; -analyze table src compute statistics for columns; \ No newline at end of file +explain analyze table t compute statistics for columns; + +analyze table t compute statistics for columns; diff --git ql/src/test/queries/clientpositive/groupby1.q ql/src/test/queries/clientpositive/groupby1.q index a8c9a8dcf8..cd3a12b44e 100755 --- ql/src/test/queries/clientpositive/groupby1.q +++ ql/src/test/queries/clientpositive/groupby1.q @@ -1,3 +1,5 @@ +-- due to testMTQueries1 +set hive.stats.column.autogather=false; set hive.mapred.mode=nonstrict; set hive.explain.user=false; set hive.map.aggr=false; diff --git ql/src/test/queries/clientpositive/groupby1_limit.q ql/src/test/queries/clientpositive/groupby1_limit.q index b8e389e511..6c40e19540 100644 --- ql/src/test/queries/clientpositive/groupby1_limit.q +++ ql/src/test/queries/clientpositive/groupby1_limit.q @@ -1,3 +1,4 @@ +set hive.stats.column.autogather=false; set hive.mapred.mode=nonstrict; set mapred.reduce.tasks=31; diff --git ql/src/test/queries/clientpositive/groupby_multi_single_reducer.q ql/src/test/queries/clientpositive/groupby_multi_single_reducer.q 
index 2b799f87eb..40976ee707 100644 --- ql/src/test/queries/clientpositive/groupby_multi_single_reducer.q +++ ql/src/test/queries/clientpositive/groupby_multi_single_reducer.q @@ -1,3 +1,6 @@ +set hive.stats.column.autogather=false; +-- due to L137 in LimitPushDownOptimization: not safe to continue for RS-GBY-GBY-LIM kind of pipelines. See HIVE-10607 for more. + set hive.multigroupby.singlereducer=true; -- SORT_QUERY_RESULTS diff --git ql/src/test/queries/clientpositive/hybridgrace_hashjoin_1.q ql/src/test/queries/clientpositive/hybridgrace_hashjoin_1.q index 9c19a8678c..ec34e7a555 100644 --- ql/src/test/queries/clientpositive/hybridgrace_hashjoin_1.q +++ ql/src/test/queries/clientpositive/hybridgrace_hashjoin_1.q @@ -1,3 +1,4 @@ +set hive.stats.column.autogather=false; set hive.mapred.mode=nonstrict; set hive.explain.user=false; -- Hybrid Grace Hash Join diff --git ql/src/test/queries/clientpositive/infer_bucket_sort_convert_join.q ql/src/test/queries/clientpositive/infer_bucket_sort_convert_join.q index 6809b721be..e4170283f3 100644 --- ql/src/test/queries/clientpositive/infer_bucket_sort_convert_join.q +++ ql/src/test/queries/clientpositive/infer_bucket_sort_convert_join.q @@ -1,3 +1,12 @@ +set hive.stats.column.autogather=false; +-- sounds weird: +-- on master, when auto=true, hive.mapjoin.localtask.max.memory.usage will be 0.55 as there is a gby +-- (L132 of LocalMapJoinProcFactory) +-- when executed in the CLI, hive.exec.submit.local.task.via.child is true and we can see the error; +-- if we set hive.exec.submit.local.task.via.child=false, we can also see it. +-- with the patch, we just merge the tasks; hive.exec.submit.local.task.via.child=false due to the pom.xml setting +-- however, even after changing it to true, it still fails. + set hive.mapred.mode=nonstrict; set hive.exec.infer.bucket.sort=true; set hive.exec.infer.bucket.sort.num.buckets.power.two=true; diff --git ql/src/test/queries/clientpositive/infer_bucket_sort_reducers_power_two.q ql/src/test/queries/clientpositive/infer_bucket_sort_reducers_power_two.q index 6824c1c032..c0ddb8bce6 100644 --- ql/src/test/queries/clientpositive/infer_bucket_sort_reducers_power_two.q +++ ql/src/test/queries/clientpositive/infer_bucket_sort_reducers_power_two.q @@ -1,3 +1,4 @@ +set hive.stats.column.autogather=false; set hive.mapred.mode=nonstrict; set hive.exec.infer.bucket.sort=true; set hive.exec.infer.bucket.sort.num.buckets.power.two=true; diff --git ql/src/test/queries/clientpositive/input11_limit.q ql/src/test/queries/clientpositive/input11_limit.q index 052a72ee68..211c37adc5 100644 --- ql/src/test/queries/clientpositive/input11_limit.q +++ ql/src/test/queries/clientpositive/input11_limit.q @@ -1,3 +1,4 @@ +set hive.stats.column.autogather=false; set hive.mapred.mode=nonstrict; -- SORT_QUERY_RESULTS diff --git ql/src/test/queries/clientpositive/input14_limit.q ql/src/test/queries/clientpositive/input14_limit.q index 7316752a6d..2f6e4e47c9 100644 --- ql/src/test/queries/clientpositive/input14_limit.q +++ ql/src/test/queries/clientpositive/input14_limit.q @@ -1,3 +1,5 @@ +set hive.stats.column.autogather=false; + CREATE TABLE dest1(key INT, value STRING) STORED AS TEXTFILE; EXPLAIN diff --git ql/src/test/queries/clientpositive/join2.q ql/src/test/queries/clientpositive/join2.q index 8aedd561e2..c3c7c241e9 100644 --- ql/src/test/queries/clientpositive/join2.q +++ ql/src/test/queries/clientpositive/join2.q @@ -1,3 +1,5 @@ +-- due to testMTQueries1 +set hive.stats.column.autogather=false; set hive.mapred.mode=nonstrict; -- SORT_QUERY_RESULTS diff --git
ql/src/test/queries/clientpositive/metadata_only_queries.q ql/src/test/queries/clientpositive/metadata_only_queries.q index 8581a46b2d..bcf320b0c5 100644 --- ql/src/test/queries/clientpositive/metadata_only_queries.q +++ ql/src/test/queries/clientpositive/metadata_only_queries.q @@ -1,3 +1,4 @@ +set hive.stats.column.autogather=false; set hive.mapred.mode=nonstrict; set hive.explain.user=false; set hive.compute.query.using.stats=true; diff --git ql/src/test/queries/clientpositive/metadata_only_queries_with_filters.q ql/src/test/queries/clientpositive/metadata_only_queries_with_filters.q index 1af813e3ed..692c414354 100644 --- ql/src/test/queries/clientpositive/metadata_only_queries_with_filters.q +++ ql/src/test/queries/clientpositive/metadata_only_queries_with_filters.q @@ -1,3 +1,4 @@ +set hive.stats.column.autogather=false; set hive.stats.dbclass=fs; set hive.compute.query.using.stats=true; set hive.explain.user=false; diff --git ql/src/test/queries/clientpositive/multiMapJoin1.q ql/src/test/queries/clientpositive/multiMapJoin1.q index 5c49b4c64f..6e16af4617 100644 --- ql/src/test/queries/clientpositive/multiMapJoin1.q +++ ql/src/test/queries/clientpositive/multiMapJoin1.q @@ -1,3 +1,4 @@ +set hive.stats.column.autogather=false; set hive.mapred.mode=nonstrict; set hive.exec.post.hooks=org.apache.hadoop.hive.ql.hooks.PostExecutePrinter,org.apache.hadoop.hive.ql.hooks.PrintCompletedTasksHook; diff --git ql/src/test/queries/clientpositive/orc_wide_table.q ql/src/test/queries/clientpositive/orc_wide_table.q index 422a3c24b1..d2ec3857d0 100644 --- ql/src/test/queries/clientpositive/orc_wide_table.q +++ ql/src/test/queries/clientpositive/orc_wide_table.q @@ -1,4 +1,5 @@ set hive.mapred.mode=nonstrict; +set hive.stats.column.autogather=false; drop table if exists test_txt; drop table if exists test_orc; create table test_txt( diff --git ql/src/test/queries/clientpositive/partition_coltype_literals.q ql/src/test/queries/clientpositive/partition_coltype_literals.q index eb56b1a93d..8da4876b70 100644 --- ql/src/test/queries/clientpositive/partition_coltype_literals.q +++ ql/src/test/queries/clientpositive/partition_coltype_literals.q @@ -1,3 +1,4 @@ +set hive.stats.column.autogather=false; set hive.compute.query.using.stats=false; drop table if exists partcoltypenum; create table partcoltypenum (key int, value string) partitioned by (tint tinyint, sint smallint, bint bigint); diff --git ql/src/test/queries/clientpositive/row__id.q ql/src/test/queries/clientpositive/row__id.q index d9cb7b0ff6..6aaa40f68f 100644 --- ql/src/test/queries/clientpositive/row__id.q +++ ql/src/test/queries/clientpositive/row__id.q @@ -1,3 +1,5 @@ +-- tid is flaky when computing column stats +set hive.stats.column.autogather=false; set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager; drop table if exists hello_acid; diff --git ql/src/test/queries/clientpositive/smb_join_partition_key.q ql/src/test/queries/clientpositive/smb_join_partition_key.q index 160bf5e36a..23027f8aa5 100644 --- ql/src/test/queries/clientpositive/smb_join_partition_key.q +++ ql/src/test/queries/clientpositive/smb_join_partition_key.q @@ -1,3 +1,5 @@ +-- Because p1 is decimal, when Derby retrieves a partition with a decimal value it will use partval = 100.0 rather than 100. As a result, the partition will not be found and an exception is thrown.
+set hive.stats.column.autogather=false; set hive.mapred.mode=nonstrict; SET hive.enforce.sortmergebucketmapjoin=false; SET hive.auto.convert.sortmerge.join=true; diff --git ql/src/test/queries/clientpositive/udf_round_2.q ql/src/test/queries/clientpositive/udf_round_2.q index 43988c1225..38885a97d4 100644 --- ql/src/test/queries/clientpositive/udf_round_2.q +++ ql/src/test/queries/clientpositive/udf_round_2.q @@ -1,4 +1,5 @@ set hive.fetch.task.conversion=more; +set hive.stats.column.autogather=false; -- test for NaN (not-a-number) create table tstTbl1(n double); diff --git ql/src/test/queries/clientpositive/vector_groupby_rollup1.q ql/src/test/queries/clientpositive/vector_groupby_rollup1.q index e08f8b9393..c2441c6864 100644 --- ql/src/test/queries/clientpositive/vector_groupby_rollup1.q +++ ql/src/test/queries/clientpositive/vector_groupby_rollup1.q @@ -1,3 +1,4 @@ +set hive.stats.column.autogather=false; set hive.explain.user=false; SET hive.vectorized.execution.enabled=true; SET hive.vectorized.execution.reduce.enabled=true; diff --git ql/src/test/queries/clientpositive/vector_multi_insert.q ql/src/test/queries/clientpositive/vector_multi_insert.q index c56ee1c4aa..e6bfb96794 100644 --- ql/src/test/queries/clientpositive/vector_multi_insert.q +++ ql/src/test/queries/clientpositive/vector_multi_insert.q @@ -1,3 +1,4 @@ +set hive.stats.column.autogather=false; set hive.explain.user=false; SET hive.vectorized.execution.enabled=true; set hive.fetch.task.conversion=none; diff --git ql/src/test/queries/clientpositive/vector_udf_character_length.q ql/src/test/queries/clientpositive/vector_udf_character_length.q index 19a5260ddc..e49a091b34 100644 --- ql/src/test/queries/clientpositive/vector_udf_character_length.q +++ ql/src/test/queries/clientpositive/vector_udf_character_length.q @@ -1,3 +1,4 @@ +set hive.stats.column.autogather=false; SET hive.vectorized.execution.enabled=true; set hive.fetch.task.conversion=none; diff --git ql/src/test/queries/clientpositive/vector_udf_octet_length.q ql/src/test/queries/clientpositive/vector_udf_octet_length.q index 06a49852a2..af4c7c4a7f 100644 --- ql/src/test/queries/clientpositive/vector_udf_octet_length.q +++ ql/src/test/queries/clientpositive/vector_udf_octet_length.q @@ -1,3 +1,4 @@ +set hive.stats.column.autogather=false; SET hive.vectorized.execution.enabled=true; set hive.fetch.task.conversion=none; diff --git ql/src/test/queries/clientpositive/vector_varchar_4.q ql/src/test/queries/clientpositive/vector_varchar_4.q index 80f84d8b9f..b3402d0df2 100644 --- ql/src/test/queries/clientpositive/vector_varchar_4.q +++ ql/src/test/queries/clientpositive/vector_varchar_4.q @@ -1,3 +1,4 @@ +set hive.stats.column.autogather=false; set hive.explain.user=false; SET hive.vectorized.execution.enabled=true; set hive.fetch.task.conversion=none; @@ -50,4 +51,4 @@ insert overwrite table varchar_lazy_binary_columnar select t, si, i, b, f, d, s -- insert overwrite table varchar_lazy_binary_columnar select t, si, i, b, f, d, s from vectortab2korc; --- select count(*) as cnt from varchar_lazy_binary_columnar group by vs order by cnt asc; \ No newline at end of file +-- select count(*) as cnt from varchar_lazy_binary_columnar group by vs order by cnt asc; diff --git ql/src/test/queries/clientpositive/vector_varchar_simple.q ql/src/test/queries/clientpositive/vector_varchar_simple.q index 6f753a748d..352ec3aebc 100644 --- ql/src/test/queries/clientpositive/vector_varchar_simple.q +++ ql/src/test/queries/clientpositive/vector_varchar_simple.q @@ -1,3 +1,4 @@ +set 
hive.stats.column.autogather=false; set hive.explain.user=false; SET hive.vectorized.execution.enabled=true; set hive.fetch.task.conversion=none; diff --git ql/src/test/results/clientpositive/autoColumnStats_1.q.out ql/src/test/results/clientpositive/autoColumnStats_1.q.out index 4cf6df18f9..5f78d88dc2 100644 --- ql/src/test/results/clientpositive/autoColumnStats_1.q.out +++ ql/src/test/results/clientpositive/autoColumnStats_1.q.out @@ -382,6 +382,46 @@ POSTHOOK: query: create table if not exists nzhang_part14 (key string) POSTHOOK: type: CREATETABLE POSTHOOK: Output: database:default POSTHOOK: Output: default@nzhang_part14 +PREHOOK: query: desc formatted nzhang_part14 +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@nzhang_part14 +POSTHOOK: query: desc formatted nzhang_part14 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@nzhang_part14 +# col_name data_type comment + +key string + +# Partition Information +# col_name data_type comment + +value string + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"} + numFiles 0 + numPartitions 0 + numRows 0 + rawDataSize 0 + totalSize 0 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 PREHOOK: query: insert overwrite table nzhang_part14 partition(value) select key, value from ( select * from (select 'k1' as key, cast(null as string) as value from src limit 2)a @@ -407,6 +447,44 @@ POSTHOOK: Output: default@nzhang_part14@value= POSTHOOK: Output: default@nzhang_part14@value=__HIVE_DEFAULT_PARTITION__ POSTHOOK: Lineage: nzhang_part14 PARTITION(value= ).key EXPRESSION [] POSTHOOK: Lineage: nzhang_part14 PARTITION(value=__HIVE_DEFAULT_PARTITION__).key EXPRESSION [] +PREHOOK: query: desc formatted nzhang_part14 partition (value=' ') +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@nzhang_part14 +POSTHOOK: query: desc formatted nzhang_part14 partition (value=' ') +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@nzhang_part14 +# col_name data_type comment + +key string + +# Partition Information +# col_name data_type comment + +value string + +# Detailed Partition Information +Partition Value: [ ] +Database: default +Table: nzhang_part14 +#### A masked pattern was here #### +Partition Parameters: + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\"}} + numFiles 1 + numRows 2 + rawDataSize 4 + totalSize 6 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 PREHOOK: query: explain select key from nzhang_part14 PREHOOK: type: QUERY POSTHOOK: query: explain select key from nzhang_part14 @@ -477,11 +555,11 @@ STAGE PLANS: Processor Tree: TableScan alias: nzhang_part14 - Statistics: Num rows: 11 Data size: 946 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 11 Data size: 946 Basic 
stats: COMPLETE Column stats: PARTIAL Select Operator expressions: key (type: string) outputColumnNames: _col0 - Statistics: Num rows: 11 Data size: 946 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 11 Data size: 946 Basic stats: COMPLETE Column stats: PARTIAL ListSink PREHOOK: query: create table alter5 ( col1 string ) partitioned by (dt string) @@ -958,6 +1036,12 @@ Retention: 0 #### A masked pattern was here #### Table Type: MANAGED_TABLE Table Parameters: + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"} + numFiles 0 + numPartitions 0 + numRows 0 + rawDataSize 0 + totalSize 0 #### A masked pattern was here #### # Storage Information diff --git ql/src/test/results/clientpositive/autoColumnStats_10.q.out ql/src/test/results/clientpositive/autoColumnStats_10.q.out new file mode 100644 index 0000000000..6b730c2bfd --- /dev/null +++ ql/src/test/results/clientpositive/autoColumnStats_10.q.out @@ -0,0 +1,452 @@ +PREHOOK: query: drop table p +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table p +POSTHOOK: type: DROPTABLE +PREHOOK: query: CREATE TABLE p(insert_num int, c1 tinyint, c2 smallint) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@p +POSTHOOK: query: CREATE TABLE p(insert_num int, c1 tinyint, c2 smallint) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@p +PREHOOK: query: desc formatted p +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@p +POSTHOOK: query: desc formatted p +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@p +# col_name data_type comment + +insert_num int +c1 tinyint +c2 smallint + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"c1\":\"true\",\"c2\":\"true\",\"insert_num\":\"true\"}} + numFiles 0 + numRows 0 + rawDataSize 0 + totalSize 0 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: insert into p values (1,22,333) +PREHOOK: type: QUERY +PREHOOK: Output: default@p +POSTHOOK: query: insert into p values (1,22,333) +POSTHOOK: type: QUERY +POSTHOOK: Output: default@p +POSTHOOK: Lineage: p.c1 EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: p.c2 EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +POSTHOOK: Lineage: p.insert_num EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: desc formatted p +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@p +POSTHOOK: query: desc formatted p +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@p +# col_name data_type comment + +insert_num int +c1 tinyint +c2 smallint + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE 
{\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"c1\":\"true\",\"c2\":\"true\",\"insert_num\":\"true\"}} + numFiles 1 + numRows 1 + rawDataSize 8 + totalSize 9 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: alter table p replace columns (insert_num int, c1 STRING, c2 STRING) +PREHOOK: type: ALTERTABLE_REPLACECOLS +PREHOOK: Input: default@p +PREHOOK: Output: default@p +POSTHOOK: query: alter table p replace columns (insert_num int, c1 STRING, c2 STRING) +POSTHOOK: type: ALTERTABLE_REPLACECOLS +POSTHOOK: Input: default@p +POSTHOOK: Output: default@p +PREHOOK: query: desc formatted p +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@p +POSTHOOK: query: desc formatted p +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@p +# col_name data_type comment + +insert_num int +c1 string +c2 string + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"insert_num\":\"true\"}} +#### A masked pattern was here #### + numFiles 1 + numRows 1 + rawDataSize 8 + totalSize 9 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: desc formatted p insert_num +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@p +POSTHOOK: query: desc formatted p insert_num +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@p +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment bitVector + +insert_num int 1 1 0 1 HL from deserializer +COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"insert_num\":\"true\"}} +PREHOOK: query: desc formatted p c1 +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@p +POSTHOOK: query: desc formatted p c1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@p +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment bitVector + +c1 string from deserializer +COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"insert_num\":\"true\"}} +PREHOOK: query: insert into p values (2,11,111) +PREHOOK: type: QUERY +PREHOOK: Output: default@p +POSTHOOK: query: insert into p values (2,11,111) +POSTHOOK: type: QUERY +POSTHOOK: Output: default@p +POSTHOOK: Lineage: p.c1 SIMPLE [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: p.c2 SIMPLE [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +POSTHOOK: Lineage: p.insert_num EXPRESSION [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: desc formatted p +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@p +POSTHOOK: query: desc formatted p +POSTHOOK: type: DESCTABLE 
+POSTHOOK: Input: default@p +# col_name data_type comment + +insert_num int +c1 string +c2 string + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"insert_num\":\"true\"}} +#### A masked pattern was here #### + numFiles 2 + numRows 2 + rawDataSize 16 + totalSize 18 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: desc formatted p insert_num +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@p +POSTHOOK: query: desc formatted p insert_num +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@p +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment bitVector + +insert_num int 1 2 0 2 HL from deserializer +COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"insert_num\":\"true\"}} +PREHOOK: query: desc formatted p c1 +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@p +POSTHOOK: query: desc formatted p c1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@p +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment bitVector + +c1 string from deserializer +COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"insert_num\":\"true\"}} +PREHOOK: query: drop table p +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@p +PREHOOK: Output: default@p +POSTHOOK: query: drop table p +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@p +POSTHOOK: Output: default@p +PREHOOK: query: CREATE TABLE p(insert_num int, c1 tinyint, c2 smallint) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@p +POSTHOOK: query: CREATE TABLE p(insert_num int, c1 tinyint, c2 smallint) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@p +PREHOOK: query: desc formatted p +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@p +POSTHOOK: query: desc formatted p +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@p +# col_name data_type comment + +insert_num int +c1 tinyint +c2 smallint + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"c1\":\"true\",\"c2\":\"true\",\"insert_num\":\"true\"}} + numFiles 0 + numRows 0 + rawDataSize 0 + totalSize 0 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: insert into p values (1,22,333) +PREHOOK: type: QUERY +PREHOOK: Output: default@p +POSTHOOK: query: insert into p values (1,22,333) +POSTHOOK: type: QUERY +POSTHOOK: Output: default@p +POSTHOOK: Lineage: p.c1 EXPRESSION 
[(values__tmp__table__3)values__tmp__table__3.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: p.c2 EXPRESSION [(values__tmp__table__3)values__tmp__table__3.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +POSTHOOK: Lineage: p.insert_num EXPRESSION [(values__tmp__table__3)values__tmp__table__3.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: desc formatted p +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@p +POSTHOOK: query: desc formatted p +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@p +# col_name data_type comment + +insert_num int +c1 tinyint +c2 smallint + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"} + numFiles 1 + numRows 1 + rawDataSize 8 + totalSize 9 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: alter table p replace columns (insert_num int, c1 STRING, c2 STRING) +PREHOOK: type: ALTERTABLE_REPLACECOLS +PREHOOK: Input: default@p +PREHOOK: Output: default@p +POSTHOOK: query: alter table p replace columns (insert_num int, c1 STRING, c2 STRING) +POSTHOOK: type: ALTERTABLE_REPLACECOLS +POSTHOOK: Input: default@p +POSTHOOK: Output: default@p +PREHOOK: query: desc formatted p +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@p +POSTHOOK: query: desc formatted p +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@p +# col_name data_type comment + +insert_num int +c1 string +c2 string + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"} +#### A masked pattern was here #### + numFiles 1 + numRows 1 + rawDataSize 8 + totalSize 9 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: desc formatted p insert_num +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@p +POSTHOOK: query: desc formatted p insert_num +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@p +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment bitVector + +insert_num int from deserializer +COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"} +PREHOOK: query: desc formatted p c1 +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@p +POSTHOOK: query: desc formatted p c1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@p +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment bitVector + +c1 string from deserializer +COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"} +PREHOOK: query: insert into p values (2,11,111) +PREHOOK: type: QUERY +PREHOOK: Output: default@p +POSTHOOK: query: insert into p values 
(2,11,111) +POSTHOOK: type: QUERY +POSTHOOK: Output: default@p +POSTHOOK: Lineage: p.c1 SIMPLE [(values__tmp__table__4)values__tmp__table__4.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: p.c2 SIMPLE [(values__tmp__table__4)values__tmp__table__4.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +POSTHOOK: Lineage: p.insert_num EXPRESSION [(values__tmp__table__4)values__tmp__table__4.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: desc formatted p +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@p +POSTHOOK: query: desc formatted p +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@p +# col_name data_type comment + +insert_num int +c1 string +c2 string + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"} +#### A masked pattern was here #### + numFiles 2 + numRows 2 + rawDataSize 16 + totalSize 18 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: desc formatted p insert_num +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@p +POSTHOOK: query: desc formatted p insert_num +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@p +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment bitVector + +insert_num int from deserializer +COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"} +PREHOOK: query: desc formatted p c1 +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@p +POSTHOOK: query: desc formatted p c1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@p +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment bitVector + +c1 string from deserializer +COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"} diff --git ql/src/test/results/clientpositive/autoColumnStats_2.q.out ql/src/test/results/clientpositive/autoColumnStats_2.q.out index 791e6ae2fd..169d0f04c3 100644 --- ql/src/test/results/clientpositive/autoColumnStats_2.q.out +++ ql/src/test/results/clientpositive/autoColumnStats_2.q.out @@ -125,18 +125,20 @@ PREHOOK: Input: default@a POSTHOOK: query: describe formatted a key POSTHOOK: type: DESCTABLE POSTHOOK: Input: default@a -# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment - -key string 0 205 2.812 3 from deserializer +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment bitVector + +key string 0 309 2.812 3 HL from deserializer +COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} PREHOOK: query: describe formatted b key PREHOOK: type: DESCTABLE PREHOOK: Input: default@b POSTHOOK: query: describe formatted b key POSTHOOK: type: DESCTABLE POSTHOOK: Input: default@b -# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment - -key string 0 205 2.812 3 from deserializer +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment bitVector + +key 
string 0 309 2.812 3 HL from deserializer +COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} PREHOOK: query: from src insert overwrite table a select * insert into table b select * @@ -231,18 +233,20 @@ PREHOOK: Input: default@b POSTHOOK: query: describe formatted b key POSTHOOK: type: DESCTABLE POSTHOOK: Input: default@b -# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment - -key string 0 205 2.812 3 from deserializer +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment bitVector + +key string 0 309 2.812 3 HL from deserializer +COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} PREHOOK: query: describe formatted b value PREHOOK: type: DESCTABLE PREHOOK: Input: default@b POSTHOOK: query: describe formatted b value POSTHOOK: type: DESCTABLE POSTHOOK: Input: default@b -# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment - -value string 0 214 6.812 7 from deserializer +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment bitVector + +value string 0 309 6.812 7 HL from deserializer +COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} PREHOOK: query: insert into table b select NULL, NULL from src limit 10 PREHOOK: type: QUERY PREHOOK: Input: default@src @@ -251,26 +255,28 @@ POSTHOOK: query: insert into table b select NULL, NULL from src limit 10 POSTHOOK: type: QUERY POSTHOOK: Input: default@src POSTHOOK: Output: default@b -POSTHOOK: Lineage: b.key SIMPLE [] -POSTHOOK: Lineage: b.value SIMPLE [] +POSTHOOK: Lineage: b.key EXPRESSION [] +POSTHOOK: Lineage: b.value EXPRESSION [] PREHOOK: query: describe formatted b key PREHOOK: type: DESCTABLE PREHOOK: Input: default@b POSTHOOK: query: describe formatted b key POSTHOOK: type: DESCTABLE POSTHOOK: Input: default@b -# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment - -key string 10 205 2.812 3 from deserializer +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment bitVector + +key string 10 309 2.812 3 HL from deserializer +COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} PREHOOK: query: describe formatted b value PREHOOK: type: DESCTABLE PREHOOK: Input: default@b POSTHOOK: query: describe formatted b value POSTHOOK: type: DESCTABLE POSTHOOK: Input: default@b -# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment - -value string 10 214 6.812 7 from deserializer +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment bitVector + +value string 10 309 6.812 7 HL from deserializer +COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} PREHOOK: query: insert into table b(value) select key+100000 from src limit 10 PREHOOK: type: QUERY PREHOOK: Input: default@src @@ -287,18 +293,20 @@ PREHOOK: Input: default@b POSTHOOK: query: describe formatted b key POSTHOOK: type: DESCTABLE POSTHOOK: Input: default@b -# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment - -key string 20 205 2.812 3 from deserializer +# col_name 
data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment bitVector + +key string 20 309 2.812 3 HL from deserializer +COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} PREHOOK: query: describe formatted b value PREHOOK: type: DESCTABLE PREHOOK: Input: default@b POSTHOOK: query: describe formatted b value POSTHOOK: type: DESCTABLE POSTHOOK: Input: default@b -# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment - -value string 10 214 8.0 8 from deserializer +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment bitVector + +value string 10 319 8.0 8 HL from deserializer +COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} PREHOOK: query: drop table src_multi2 PREHOOK: type: DROPTABLE POSTHOOK: query: drop table src_multi2 @@ -467,11 +475,11 @@ STAGE PLANS: Processor Tree: TableScan alias: nzhang_part14 - Statistics: Num rows: 11 Data size: 946 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 11 Data size: 946 Basic stats: COMPLETE Column stats: PARTIAL Select Operator expressions: key (type: string) outputColumnNames: _col0 - Statistics: Num rows: 11 Data size: 946 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 11 Data size: 946 Basic stats: COMPLETE Column stats: PARTIAL ListSink PREHOOK: query: drop table alter5 @@ -514,7 +522,7 @@ Database: default Table: alter5 #### A masked pattern was here #### Partition Parameters: - COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"} + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"col1\":\"true\"}} numFiles 0 numRows 0 rawDataSize 0 @@ -687,7 +695,6 @@ Database: default Table: alter5 #### A masked pattern was here #### Partition Parameters: - COLUMN_STATS_ACCURATE {\"COLUMN_STATS\":{\"col1\":\"true\"}} numFiles 1 totalSize 1906 #### A masked pattern was here #### @@ -1079,6 +1086,12 @@ Retention: 0 #### A masked pattern was here #### Table Type: MANAGED_TABLE Table Parameters: + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"} + numFiles 0 + numPartitions 0 + numRows 0 + rawDataSize 0 + totalSize 0 #### A masked pattern was here #### # Storage Information diff --git ql/src/test/results/clientpositive/autoColumnStats_3.q.out ql/src/test/results/clientpositive/autoColumnStats_3.q.out index 2f70095b7a..3a23c94b13 100644 --- ql/src/test/results/clientpositive/autoColumnStats_3.q.out +++ ql/src/test/results/clientpositive/autoColumnStats_3.q.out @@ -13,10 +13,12 @@ POSTHOOK: Output: default@src_multi1 PREHOOK: query: analyze table src_multi1 compute statistics for columns key PREHOOK: type: QUERY PREHOOK: Input: default@src_multi1 +PREHOOK: Output: default@src_multi1 #### A masked pattern was here #### POSTHOOK: query: analyze table src_multi1 compute statistics for columns key POSTHOOK: type: QUERY POSTHOOK: Input: default@src_multi1 +POSTHOOK: Output: default@src_multi1 #### A masked pattern was here #### PREHOOK: query: describe formatted src_multi1 PREHOOK: type: DESCTABLE @@ -228,11 +230,15 @@ PREHOOK: query: analyze table nzhang_part14 partition(ds='1', hr='3') compute st PREHOOK: type: QUERY PREHOOK: Input: default@nzhang_part14 PREHOOK: Input: default@nzhang_part14@ds=1/hr=3 +PREHOOK: Output: default@nzhang_part14 +PREHOOK: Output: default@nzhang_part14@ds=1/hr=3 #### A masked pattern was here #### POSTHOOK: query: analyze table 
nzhang_part14 partition(ds='1', hr='3') compute statistics for columns value POSTHOOK: type: QUERY POSTHOOK: Input: default@nzhang_part14 POSTHOOK: Input: default@nzhang_part14@ds=1/hr=3 +POSTHOOK: Output: default@nzhang_part14 +POSTHOOK: Output: default@nzhang_part14@ds=1/hr=3 #### A masked pattern was here #### PREHOOK: query: desc formatted nzhang_part14 partition(ds='1', hr='3') PREHOOK: type: DESCTABLE @@ -367,7 +373,7 @@ Database: default Table: nzhang_part14 #### A masked pattern was here #### Partition Parameters: - COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"value\":\"true\"}} numFiles 2 numRows 4 rawDataSize 12 @@ -407,7 +413,7 @@ Database: default Table: nzhang_part14 #### A masked pattern was here #### Partition Parameters: - COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"} numFiles 2 numRows 4 rawDataSize 16 diff --git ql/src/test/results/clientpositive/autoColumnStats_4.q.out ql/src/test/results/clientpositive/autoColumnStats_4.q.out index a0581f8d8a..183ef06d4a 100644 --- ql/src/test/results/clientpositive/autoColumnStats_4.q.out +++ ql/src/test/results/clientpositive/autoColumnStats_4.q.out @@ -50,8 +50,7 @@ STAGE DEPENDENCIES: Stage-1 is a root stage Stage-2 depends on stages: Stage-1 Stage-0 depends on stages: Stage-2 - Stage-3 depends on stages: Stage-0 - Stage-5 depends on stages: Stage-3, Stage-4 + Stage-3 depends on stages: Stage-0, Stage-4 Stage-4 depends on stages: Stage-2 STAGE PLANS: @@ -138,10 +137,8 @@ STAGE PLANS: name: default.acid_dtt Stage: Stage-3 - Stats-Aggr Operator - - Stage: Stage-5 - Column Stats Work + Stats Work + Basic Stats Work: Column Stats Desc: Columns: a, b Column Types: int, varchar(128) @@ -197,7 +194,6 @@ Retention: 0 #### A masked pattern was here #### Table Type: MANAGED_TABLE Table Parameters: - COLUMN_STATS_ACCURATE {\"COLUMN_STATS\":{\"a\":\"true\",\"b\":\"true\"}} numFiles 2 numRows 0 rawDataSize 0 diff --git ql/src/test/results/clientpositive/autoColumnStats_5.q.out ql/src/test/results/clientpositive/autoColumnStats_5.q.out index 1298d989a2..36a94df3ef 100644 --- ql/src/test/results/clientpositive/autoColumnStats_5.q.out +++ ql/src/test/results/clientpositive/autoColumnStats_5.q.out @@ -17,7 +17,6 @@ STAGE DEPENDENCIES: Stage-4 Stage-0 depends on stages: Stage-4, Stage-3, Stage-6 Stage-2 depends on stages: Stage-0 - Stage-8 depends on stages: Stage-2 Stage-3 Stage-5 Stage-6 depends on stages: Stage-5 @@ -98,10 +97,8 @@ STAGE PLANS: name: default.partitioned1 Stage: Stage-2 - Stats-Aggr Operator - - Stage: Stage-8 - Column Stats Work + Stats Work + Basic Stats Work: Column Stats Desc: Columns: a, b Column Types: int, string @@ -255,7 +252,6 @@ STAGE DEPENDENCIES: Stage-4 Stage-0 depends on stages: Stage-4, Stage-3, Stage-6 Stage-2 depends on stages: Stage-0 - Stage-8 depends on stages: Stage-2 Stage-3 Stage-5 Stage-6 depends on stages: Stage-5 @@ -336,10 +332,8 @@ STAGE PLANS: name: default.partitioned1 Stage: Stage-2 - Stats-Aggr Operator - - Stage: Stage-8 - Column Stats Work + Stats Work + Basic Stats Work: Column Stats Desc: Columns: a, b, c, d Column Types: int, string, int, string @@ -449,7 +443,6 @@ STAGE DEPENDENCIES: Stage-4 Stage-0 depends on stages: Stage-4, Stage-3, Stage-6 Stage-2 depends on stages: Stage-0 - Stage-8 depends on stages: Stage-2 Stage-3 Stage-5 Stage-6 depends on stages: 
Stage-5 @@ -530,10 +523,8 @@ STAGE PLANS: name: default.partitioned1 Stage: Stage-2 - Stats-Aggr Operator - - Stage: Stage-8 - Column Stats Work + Stats Work + Basic Stats Work: Column Stats Desc: Columns: a, b, c, d Column Types: int, string, int, string @@ -603,7 +594,7 @@ Database: default Table: partitioned1 #### A masked pattern was here #### Partition Parameters: - COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"a\":\"true\",\"b\":\"true\",\"c\":\"true\",\"d\":\"true\"}} + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"a\":\"true\",\"b\":\"true\"}} numFiles 2 numRows 6 rawDataSize 78 @@ -637,6 +628,6 @@ POSTHOOK: query: desc formatted partitioned1 partition(part=1) c POSTHOOK: type: DESCTABLE POSTHOOK: Input: default@partitioned1 col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment bitvector -# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment bitVector +# col_name data_type comment -c int 100 200 0 2 HL from deserializer +c int from deserializer diff --git ql/src/test/results/clientpositive/autoColumnStats_6.q.out ql/src/test/results/clientpositive/autoColumnStats_6.q.out index c4ab489b39..70788fde9f 100644 --- ql/src/test/results/clientpositive/autoColumnStats_6.q.out +++ ql/src/test/results/clientpositive/autoColumnStats_6.q.out @@ -30,7 +30,6 @@ STAGE DEPENDENCIES: Stage-4 Stage-0 depends on stages: Stage-4, Stage-3, Stage-6 Stage-2 depends on stages: Stage-0 - Stage-8 depends on stages: Stage-2 Stage-3 Stage-5 Stage-6 depends on stages: Stage-5 @@ -113,10 +112,8 @@ STAGE PLANS: name: default.orcfile_merge2a Stage: Stage-2 - Stats-Aggr Operator - - Stage: Stage-8 - Column Stats Work + Stats Work + Basic Stats Work: Column Stats Desc: Columns: key, value Column Types: int, string diff --git ql/src/test/results/clientpositive/autoColumnStats_7.q.out ql/src/test/results/clientpositive/autoColumnStats_7.q.out index 2dc9fc2d42..acea69e484 100644 --- ql/src/test/results/clientpositive/autoColumnStats_7.q.out +++ ql/src/test/results/clientpositive/autoColumnStats_7.q.out @@ -26,8 +26,7 @@ STAGE DEPENDENCIES: Stage-1 is a root stage Stage-2 depends on stages: Stage-1 Stage-0 depends on stages: Stage-2 - Stage-3 depends on stages: Stage-0 - Stage-6 depends on stages: Stage-3, Stage-5 + Stage-3 depends on stages: Stage-0, Stage-5 Stage-4 depends on stages: Stage-2 Stage-5 depends on stages: Stage-4 @@ -112,10 +111,8 @@ STAGE PLANS: name: default.dest_g2 Stage: Stage-3 - Stats-Aggr Operator - - Stage: Stage-6 - Column Stats Work + Stats Work + Basic Stats Work: Column Stats Desc: Columns: key, c1, c2 Column Types: string, int, string diff --git ql/src/test/results/clientpositive/autoColumnStats_8.q.out ql/src/test/results/clientpositive/autoColumnStats_8.q.out index c913d97fe5..3afd07a560 100644 --- ql/src/test/results/clientpositive/autoColumnStats_8.q.out +++ ql/src/test/results/clientpositive/autoColumnStats_8.q.out @@ -48,10 +48,8 @@ STAGE DEPENDENCIES: Stage-2 is a root stage Stage-0 depends on stages: Stage-2 Stage-3 depends on stages: Stage-0 - Stage-6 depends on stages: Stage-3, Stage-4, Stage-5 - Stage-7 depends on stages: Stage-3, Stage-4, Stage-5 Stage-1 depends on stages: Stage-2 - Stage-4 depends on stages: Stage-1 + Stage-4 depends on stages: Stage-1, Stage-5 Stage-5 depends on stages: Stage-2 STAGE PLANS: @@ -448,25 +446,10 @@ STAGE PLANS: name: default.nzhang_part8 Stage: Stage-3 - Stats-Aggr Operator + Stats Work + Basic Stats 
Work: #### A masked pattern was here #### - Stage: Stage-6 - Column Stats Work - Column Stats Desc: - Columns: key, value - Column Types: string, string - Table: default.nzhang_part8 - Is Table Level Stats: false - - Stage: Stage-7 - Column Stats Work - Column Stats Desc: - Columns: key, value - Column Types: string, string - Table: default.nzhang_part8 - Is Table Level Stats: false - Stage: Stage-1 Move Operator tables: @@ -496,8 +479,14 @@ STAGE PLANS: name: default.nzhang_part8 Stage: Stage-4 - Stats-Aggr Operator + Stats Work + Basic Stats Work: #### A masked pattern was here #### + Column Stats Desc: + Columns: key, value + Column Types: string, string + Table: default.nzhang_part8 + Is Table Level Stats: false Stage: Stage-5 Map Reduce diff --git ql/src/test/results/clientpositive/autoColumnStats_9.q.out ql/src/test/results/clientpositive/autoColumnStats_9.q.out index fda71e0dd2..78b3316a11 100644 --- ql/src/test/results/clientpositive/autoColumnStats_9.q.out +++ ql/src/test/results/clientpositive/autoColumnStats_9.q.out @@ -20,8 +20,7 @@ STAGE DEPENDENCIES: Stage-7 Stage-5 depends on stages: Stage-7 Stage-0 depends on stages: Stage-5 - Stage-2 depends on stages: Stage-0 - Stage-8 depends on stages: Stage-2, Stage-3 + Stage-2 depends on stages: Stage-0, Stage-3 Stage-3 depends on stages: Stage-5 STAGE PLANS: @@ -166,10 +165,8 @@ STAGE PLANS: name: default.dest_j1 Stage: Stage-2 - Stats-Aggr Operator - - Stage: Stage-8 - Column Stats Work + Stats Work + Basic Stats Work: Column Stats Desc: Columns: key, value Column Types: int, string diff --git ql/src/test/results/clientpositive/llap/autoColumnStats_1.q.out ql/src/test/results/clientpositive/llap/autoColumnStats_1.q.out index 6369dd3477..2926cef4fa 100644 --- ql/src/test/results/clientpositive/llap/autoColumnStats_1.q.out +++ ql/src/test/results/clientpositive/llap/autoColumnStats_1.q.out @@ -380,6 +380,46 @@ POSTHOOK: query: create table if not exists nzhang_part14 (key string) POSTHOOK: type: CREATETABLE POSTHOOK: Output: database:default POSTHOOK: Output: default@nzhang_part14 +PREHOOK: query: desc formatted nzhang_part14 +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@nzhang_part14 +POSTHOOK: query: desc formatted nzhang_part14 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@nzhang_part14 +# col_name data_type comment + +key string + +# Partition Information +# col_name data_type comment + +value string + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"} + numFiles 0 + numPartitions 0 + numRows 0 + rawDataSize 0 + totalSize 0 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 PREHOOK: query: insert overwrite table nzhang_part14 partition(value) select key, value from ( select * from (select 'k1' as key, cast(null as string) as value from src limit 2)a @@ -405,6 +445,44 @@ POSTHOOK: Output: default@nzhang_part14@value= POSTHOOK: Output: default@nzhang_part14@value=__HIVE_DEFAULT_PARTITION__ POSTHOOK: Lineage: nzhang_part14 PARTITION(value= ).key EXPRESSION [] POSTHOOK: Lineage: nzhang_part14 
PARTITION(value=__HIVE_DEFAULT_PARTITION__).key EXPRESSION [] +PREHOOK: query: desc formatted nzhang_part14 partition (value=' ') +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@nzhang_part14 +POSTHOOK: query: desc formatted nzhang_part14 partition (value=' ') +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@nzhang_part14 +# col_name data_type comment + +key string + +# Partition Information +# col_name data_type comment + +value string + +# Detailed Partition Information +Partition Value: [ ] +Database: default +Table: nzhang_part14 +#### A masked pattern was here #### +Partition Parameters: + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\"}} + numFiles 1 + numRows 2 + rawDataSize 4 + totalSize 6 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 PREHOOK: query: explain select key from nzhang_part14 PREHOOK: type: QUERY POSTHOOK: query: explain select key from nzhang_part14 diff --git ql/src/test/results/clientpositive/llap/autoColumnStats_10.q.out ql/src/test/results/clientpositive/llap/autoColumnStats_10.q.out new file mode 100644 index 0000000000..6b730c2bfd --- /dev/null +++ ql/src/test/results/clientpositive/llap/autoColumnStats_10.q.out @@ -0,0 +1,452 @@ +PREHOOK: query: drop table p +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table p +POSTHOOK: type: DROPTABLE +PREHOOK: query: CREATE TABLE p(insert_num int, c1 tinyint, c2 smallint) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@p +POSTHOOK: query: CREATE TABLE p(insert_num int, c1 tinyint, c2 smallint) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@p +PREHOOK: query: desc formatted p +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@p +POSTHOOK: query: desc formatted p +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@p +# col_name data_type comment + +insert_num int +c1 tinyint +c2 smallint + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"c1\":\"true\",\"c2\":\"true\",\"insert_num\":\"true\"}} + numFiles 0 + numRows 0 + rawDataSize 0 + totalSize 0 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: insert into p values (1,22,333) +PREHOOK: type: QUERY +PREHOOK: Output: default@p +POSTHOOK: query: insert into p values (1,22,333) +POSTHOOK: type: QUERY +POSTHOOK: Output: default@p +POSTHOOK: Lineage: p.c1 EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: p.c2 EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +POSTHOOK: Lineage: p.insert_num EXPRESSION 
[(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: desc formatted p +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@p +POSTHOOK: query: desc formatted p +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@p +# col_name data_type comment + +insert_num int +c1 tinyint +c2 smallint + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"c1\":\"true\",\"c2\":\"true\",\"insert_num\":\"true\"}} + numFiles 1 + numRows 1 + rawDataSize 8 + totalSize 9 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: alter table p replace columns (insert_num int, c1 STRING, c2 STRING) +PREHOOK: type: ALTERTABLE_REPLACECOLS +PREHOOK: Input: default@p +PREHOOK: Output: default@p +POSTHOOK: query: alter table p replace columns (insert_num int, c1 STRING, c2 STRING) +POSTHOOK: type: ALTERTABLE_REPLACECOLS +POSTHOOK: Input: default@p +POSTHOOK: Output: default@p +PREHOOK: query: desc formatted p +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@p +POSTHOOK: query: desc formatted p +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@p +# col_name data_type comment + +insert_num int +c1 string +c2 string + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"insert_num\":\"true\"}} +#### A masked pattern was here #### + numFiles 1 + numRows 1 + rawDataSize 8 + totalSize 9 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: desc formatted p insert_num +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@p +POSTHOOK: query: desc formatted p insert_num +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@p +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment bitVector + +insert_num int 1 1 0 1 HL from deserializer +COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"insert_num\":\"true\"}} +PREHOOK: query: desc formatted p c1 +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@p +POSTHOOK: query: desc formatted p c1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@p +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment bitVector + +c1 string from deserializer +COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"insert_num\":\"true\"}} +PREHOOK: query: insert into p values (2,11,111) +PREHOOK: type: QUERY +PREHOOK: Output: default@p +POSTHOOK: query: insert into p values (2,11,111) +POSTHOOK: type: QUERY +POSTHOOK: Output: default@p +POSTHOOK: 
Lineage: p.c1 SIMPLE [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: p.c2 SIMPLE [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +POSTHOOK: Lineage: p.insert_num EXPRESSION [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: desc formatted p +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@p +POSTHOOK: query: desc formatted p +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@p +# col_name data_type comment + +insert_num int +c1 string +c2 string + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"insert_num\":\"true\"}} +#### A masked pattern was here #### + numFiles 2 + numRows 2 + rawDataSize 16 + totalSize 18 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: desc formatted p insert_num +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@p +POSTHOOK: query: desc formatted p insert_num +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@p +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment bitVector + +insert_num int 1 2 0 2 HL from deserializer +COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"insert_num\":\"true\"}} +PREHOOK: query: desc formatted p c1 +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@p +POSTHOOK: query: desc formatted p c1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@p +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment bitVector + +c1 string from deserializer +COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"insert_num\":\"true\"}} +PREHOOK: query: drop table p +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@p +PREHOOK: Output: default@p +POSTHOOK: query: drop table p +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@p +POSTHOOK: Output: default@p +PREHOOK: query: CREATE TABLE p(insert_num int, c1 tinyint, c2 smallint) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@p +POSTHOOK: query: CREATE TABLE p(insert_num int, c1 tinyint, c2 smallint) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@p +PREHOOK: query: desc formatted p +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@p +POSTHOOK: query: desc formatted p +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@p +# col_name data_type comment + +insert_num int +c1 tinyint +c2 smallint + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"c1\":\"true\",\"c2\":\"true\",\"insert_num\":\"true\"}} + numFiles 0 + numRows 0 + rawDataSize 0 + totalSize 0 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: insert into p values (1,22,333) +PREHOOK: type: QUERY +PREHOOK: Output: default@p +POSTHOOK: query: insert into p values (1,22,333) +POSTHOOK: type: QUERY +POSTHOOK: Output: default@p +POSTHOOK: Lineage: p.c1 EXPRESSION [(values__tmp__table__3)values__tmp__table__3.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: p.c2 EXPRESSION [(values__tmp__table__3)values__tmp__table__3.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +POSTHOOK: Lineage: p.insert_num EXPRESSION [(values__tmp__table__3)values__tmp__table__3.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: desc formatted p +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@p +POSTHOOK: query: desc formatted p +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@p +# col_name data_type comment + +insert_num int +c1 tinyint +c2 smallint + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"} + numFiles 1 + numRows 1 + rawDataSize 8 + totalSize 9 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: alter table p replace columns (insert_num int, c1 STRING, c2 STRING) +PREHOOK: type: ALTERTABLE_REPLACECOLS +PREHOOK: Input: default@p +PREHOOK: Output: default@p +POSTHOOK: query: alter table p replace columns (insert_num int, c1 STRING, c2 STRING) +POSTHOOK: type: ALTERTABLE_REPLACECOLS +POSTHOOK: Input: default@p +POSTHOOK: Output: default@p +PREHOOK: query: desc formatted p +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@p +POSTHOOK: query: desc formatted p +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@p +# col_name data_type comment + +insert_num int +c1 string +c2 string + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"} +#### A masked pattern was here #### + numFiles 1 + numRows 1 + rawDataSize 8 + totalSize 9 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: desc formatted p insert_num +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@p +POSTHOOK: query: desc formatted p insert_num +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@p +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment bitVector + +insert_num int from deserializer +COLUMN_STATS_ACCURATE 
{\"BASIC_STATS\":\"true\"} +PREHOOK: query: desc formatted p c1 +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@p +POSTHOOK: query: desc formatted p c1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@p +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment bitVector + +c1 string from deserializer +COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"} +PREHOOK: query: insert into p values (2,11,111) +PREHOOK: type: QUERY +PREHOOK: Output: default@p +POSTHOOK: query: insert into p values (2,11,111) +POSTHOOK: type: QUERY +POSTHOOK: Output: default@p +POSTHOOK: Lineage: p.c1 SIMPLE [(values__tmp__table__4)values__tmp__table__4.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: p.c2 SIMPLE [(values__tmp__table__4)values__tmp__table__4.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +POSTHOOK: Lineage: p.insert_num EXPRESSION [(values__tmp__table__4)values__tmp__table__4.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: desc formatted p +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@p +POSTHOOK: query: desc formatted p +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@p +# col_name data_type comment + +insert_num int +c1 string +c2 string + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"} +#### A masked pattern was here #### + numFiles 2 + numRows 2 + rawDataSize 16 + totalSize 18 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: desc formatted p insert_num +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@p +POSTHOOK: query: desc formatted p insert_num +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@p +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment bitVector + +insert_num int from deserializer +COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"} +PREHOOK: query: desc formatted p c1 +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@p +POSTHOOK: query: desc formatted p c1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@p +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment bitVector + +c1 string from deserializer +COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"} diff --git ql/src/test/results/clientpositive/llap/autoColumnStats_2.q.out ql/src/test/results/clientpositive/llap/autoColumnStats_2.q.out index a61eafadc9..25cd3756d5 100644 --- ql/src/test/results/clientpositive/llap/autoColumnStats_2.q.out +++ ql/src/test/results/clientpositive/llap/autoColumnStats_2.q.out @@ -687,7 +687,6 @@ Database: default Table: alter5 #### A masked pattern was here #### Partition Parameters: - COLUMN_STATS_ACCURATE {\"COLUMN_STATS\":{\"col1\":\"true\"}} numFiles 1 totalSize 1906 #### A masked pattern was here ####